Home > Software design >  How can I get regular expressions to select appropriate street names using stringr in R?
How can I get regular expressions to select appropriate street names using stringr in R?

Time:01-06

I am just getting started with regular expressions (working with the stringr package), and I have written some code that does not do exactly what I want it to do. I am working with a dataset with some very messy string data and am trying to clean it up to be used with a google maps API.

I've attached a sample of the data below.

Basically, I want to select every row where loc_01 is a simple street name. By this, I mean I want it to take on the following formats:

A numbered street, such as 10th Ave; A named street, such as MAIN ST, and any directional modification of such street names (such as 10TH AVE NW, W MAIN ST, or W 10TH AVE.)

I have tried the following expression:

df %>% filter(str_detect(loc_01, "^(\\w )?(\\s)?.*(\\s)AVE|ST|BLVD(\\w )?$"))

But this gives me outputs such as 10 AVE 1300 BLK E, which is not an observation I want to select. I interpret my regular expression as:

  • "select the rows beginning with a character (optional) - to take care of strings beginning with N, NW, etc, followed by a space (if necessary), a 'word' of any length beginning with any character (for 10th st or MLK Ave), followed by a space and the suffixes 'AVE, ST, or BLVD', and an optional word (to take care of 10th st W and the like)."

Clearly, my interpretation is wrong since I am getting things like 10 AVD 1300 BLK E. What is the correct regular expression to use in this case to get what I want?

Thank you very much for the help!

structure(list(ID = c("387", "404", "422", "425", "432", "443", 
"526", "536", "580", "658", "665", "666", "735", "880", "910", 
"911", "912", "913", "916", "917", "972", "1098", "1194", "1231", 
"1298", "1309", "1310", "1311", "1312", "1316", "1328", "1354", 
"1371", "1373", "1374", "1376", "1381", "1388", "1389", "1390", 
"1391", "1392", "1393", "1406", "1407", "1408", "1409", "1410", 
"1411", "1412", "1413", "1414", "1418", "1420", "1422", "1429", 
"1430", "1433", "1434", "1437", "1441", "1442", "1443", "1444", 
"1445", "1448", "1451", "1452", "1453", "1454", "1455", "1457", 
"1461", "1462", "1463", "1464", "1466", "1468", "1470", "1471", 
"1473", "1479", "1480", "1481", "1486", "1489", "1490", "1493", 
"1495", "1496", "1498", "1502", "1503", "1509", "1511", "1512", 
"1513", "1517", "1", "2"), city = c("DENVER", "DENVER", "DENVER", 
"DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER", 
"DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER", 
"DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER", 
"DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER", 
"DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER", 
"DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER", 
"DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER", 
"DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER", 
"DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER", 
"DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER", 
"DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER", 
"DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER", 
"DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER", 
"DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER"), 
    loc_01 = c("#50 S KALAMATH ST", "00 BLKS BRYANT CANOSA", 
    "000 BLK ALLEY", "000 BLK BROADWAY", "000 BLK E 11TH AV", 
    "000 BLK E 17TH", "000 BLK S BROADWAY", "000 BLK S IRVING JULIAN", 
    "000 BLK W ALAMEDA AV", "10 AVE 1300 BLK E", "100 BLK ALLEY N BROADWAY/N ACOMA", 
    "100 BLK ALLEY S", "100 BLK N WASHINGTON ST", "1000 ALLEY LINCOLN/BROADWAY", 
    "1000 BLK ALLEY CHEROKEE/DELAWARE", "1000 BLK ALLEY GRANT", 
    "1000 BLK ALLEY MARTIN/LAFAYETT", "1000 BLK ALLEY MONROE/GARFIELD", 
    "1000 BLK ALLEY OGDEN", "1000 BLK ALLEY S GAYLORD ST", "1000 BLK E GAY", 
    "1000 BLK S VINE/GAYLORD ALLEY", "1010 CURTIS ST", "1050 ODELL ST", 
    "109TH AVE", "10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE", 
    "10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE", 
    "10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE", 
    "10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE", 
    "10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE", 
    "10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE", 
    "10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE", 
    "10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE", 
    "10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE", 
    "10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE", 
    "10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE", 
    "10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE", 
    "10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE", 
    "10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE", 
    "10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE", "E 10TH AVE", 
    "MAIN ST NW"), link = c("", "", "", "00125FN", "00025FW", 
    "AT", "", "00050FS", "00005FW", "00100FE", "00043FN", "", 
    "", "", "", "AT", "", "00120FS", "", "00070FN", "", "00200FS", 
    "", "", "00020FS", "09999FN", "AT", "AT", "AT", "AT", "AT", 
    "AT", "AT", "00080FW", "00175FW", "AT", "00101FW", "AT", 
    "AT", "AT", "AT", "AT", "AT", "00060FE", "00120FS", "AT", 
    "AT", "AT", "AT", "00015FW", "00035FW", "00075FW", "00022FE", 
    "00144FW", "00250FE", "AT", "AT", "00037FW", "00100FE", "00200FW", 
    "AT", "AT", "00084FW", "00100FW", "AT", "00100FN", "AT", 
    "AT", "AT", "AT", "AT", "00100FW", "00068FE", "00136FE", 
    "00200FE", "00150FW", "AT", "00020FE", "00020FW", "00030FE", 
    "00045FW", "AT", "AT", "AT", "AT", "AT", "AT", "AT", "00163FE", 
    "AT", "AT", "AT", "AT", "AT", "AT", "AT", "00100FE", "00020FW", 
    "", ""), loc_02 = c("N SIDE OF BLDG", "ALLEY", "KNOX CT/KING ST", 
    "IRVINGTON PL", "N LINDA ST", "POLE 94 79", "PKG METER BS-46", 
    "ALLEY", "POLE 844/005", "MARION ST N", "W 1ST AV", "MEADE/NEWTON", 
    "E 1ST AVE", "10 AV", "W 11TH AVE", "LOGAN ST", "E 11TH AV", 
    "E 11TH AVE", "CORONA ST", "E MISSISSIPPI AVE", "CORONA ST", 
    "E TENNESSEE AVE", "31ST STREET", "E 11TH AVE", "QUEENSBURG ST", 
    "10TH AVE 1407 E", "10TH AVE 2900 BLK W", "10TH AVE 300 BLK E", 
    "10TH AVE 3200 BLK W", "1295 W", "2900 W", "500 E", "ACOMA / BANNOCK ALLEY", 
    "ACOMA ST", "ACOMA ST", "ADAMS ST", "BANNOCK ST", "BANNOCK ST", 
    "BANNOCK ST", "BANNOCK ST", "BANNOCK ST", "BANNOCK ST", "BANNOCK ST N", 
    "BROADWAY ST", "BROADWAY ST", "BROADWAY ST", "BROADWAY ST", 
    "BROADWAY ST", "BROADWAY ST", "BROADWAY ST N", "BRYANT ST", 
    "BRYANT ST", "CLARKSON ST", "CLARKSON ST", "CLARKSON ST", 
    "CLARKSON ST", "CLARKSON ST", "CORONA ST", "CORONA ST", "CORONA ST", 
    "CORONA ST", "CORONA ST", "CORONA ST N", "DECATUR ST", "DECATUR ST", 
    "DOWNING ST", "DOWNING ST", "DOWNING ST", "DOWNING ST", "DOWNING ST", 
    "DOWNING ST", "DOWNING ST N", "FEDERAL BLVD", "FEDERAL BLVD", 
    "FEDERAL BLVD", "GALAPAGO ST", "GARFIELD ST", "GRANT ST", 
    "GRANT ST", "GRANT ST", "GRANT ST", "GRANT ST", "GRANT ST", 
    "GRANT ST", "GROVE ST", "GROVE ST", "GROVE ST", "HOOKER ST", 
    "HUMBOLDT ST", "HUMBOLDT ST", "INCA ST", "KALAMATH ST", "KALAMATH ST", 
    "KNOX CT", "KNOX CT", "KNOX CT", "LAFAYETTE ST", "LINCOLN ST", 
    "MAIN ST", "100TH AVE")), row.names = c(NA, -100L), class = "data.frame")

CodePudding user response:

One way could be to add an additional filter statement (though I'm sure there's a better way to do it).

library(tidyverse)

df %>%
    filter(str_detect(loc_01, "^(\\w )?(\\s)?.*(\\s)AVE|ST|BLVD(\\w )?$")) %>%
    filter(!str_detect(loc_01, 'BLK'))

Output

     ID   city            loc_01    link                loc_02
1   387 DENVER #50 S KALAMATH ST                N SIDE OF BLDG
2  1194 DENVER    1010 CURTIS ST                   31ST STREET
3  1231 DENVER     1050 ODELL ST                    E 11TH AVE
4  1298 DENVER         109TH AVE 00020FS         QUEENSBURG ST
5  1309 DENVER          10TH AVE 09999FN       10TH AVE 1407 E
6  1310 DENVER          10TH AVE      AT   10TH AVE 2900 BLK W
7  1311 DENVER          10TH AVE      AT    10TH AVE 300 BLK E
8  1312 DENVER          10TH AVE      AT   10TH AVE 3200 BLK W
9  1316 DENVER          10TH AVE      AT                1295 W
10 1328 DENVER          10TH AVE      AT                2900 W
11 1354 DENVER          10TH AVE      AT                 500 E
12 1371 DENVER          10TH AVE      AT ACOMA / BANNOCK ALLEY
13 1373 DENVER          10TH AVE 00080FW              ACOMA ST
14 1374 DENVER          10TH AVE 00175FW              ACOMA ST
15 1376 DENVER          10TH AVE      AT              ADAMS ST
16 1381 DENVER          10TH AVE 00101FW            BANNOCK ST
17 1388 DENVER          10TH AVE      AT            BANNOCK ST
18 1389 DENVER          10TH AVE      AT            BANNOCK ST
19 1390 DENVER          10TH AVE      AT            BANNOCK ST
20 1391 DENVER          10TH AVE      AT            BANNOCK ST
21 1392 DENVER          10TH AVE      AT            BANNOCK ST
22 1393 DENVER          10TH AVE      AT          BANNOCK ST N
23 1406 DENVER          10TH AVE 00060FE           BROADWAY ST
24 1407 DENVER          10TH AVE 00120FS           BROADWAY ST
25 1408 DENVER          10TH AVE      AT           BROADWAY ST
26 1409 DENVER          10TH AVE      AT           BROADWAY ST
27 1410 DENVER          10TH AVE      AT           BROADWAY ST
28 1411 DENVER          10TH AVE      AT           BROADWAY ST
29 1412 DENVER          10TH AVE 00015FW         BROADWAY ST N
30 1413 DENVER          10TH AVE 00035FW             BRYANT ST
31 1414 DENVER          10TH AVE 00075FW             BRYANT ST
32 1418 DENVER          10TH AVE 00022FE           CLARKSON ST
33 1420 DENVER          10TH AVE 00144FW           CLARKSON ST
34 1422 DENVER          10TH AVE 00250FE           CLARKSON ST
35 1429 DENVER          10TH AVE      AT           CLARKSON ST
36 1430 DENVER          10TH AVE      AT           CLARKSON ST
37 1433 DENVER          10TH AVE 00037FW             CORONA ST
38 1434 DENVER          10TH AVE 00100FE             CORONA ST
39 1437 DENVER          10TH AVE 00200FW             CORONA ST
40 1441 DENVER          10TH AVE      AT             CORONA ST
41 1442 DENVER          10TH AVE      AT             CORONA ST
42 1443 DENVER          10TH AVE 00084FW           CORONA ST N
43 1444 DENVER          10TH AVE 00100FW            DECATUR ST
44 1445 DENVER          10TH AVE      AT            DECATUR ST
45 1448 DENVER          10TH AVE 00100FN            DOWNING ST
46 1451 DENVER          10TH AVE      AT            DOWNING ST
47 1452 DENVER          10TH AVE      AT            DOWNING ST
48 1453 DENVER          10TH AVE      AT            DOWNING ST
49 1454 DENVER          10TH AVE      AT            DOWNING ST
50 1455 DENVER          10TH AVE      AT            DOWNING ST
51 1457 DENVER          10TH AVE 00100FW          DOWNING ST N
52 1461 DENVER          10TH AVE 00068FE          FEDERAL BLVD
53 1462 DENVER          10TH AVE 00136FE          FEDERAL BLVD
54 1463 DENVER          10TH AVE 00200FE          FEDERAL BLVD
55 1464 DENVER          10TH AVE 00150FW           GALAPAGO ST
56 1466 DENVER          10TH AVE      AT           GARFIELD ST
57 1468 DENVER          10TH AVE 00020FE              GRANT ST
58 1470 DENVER          10TH AVE 00020FW              GRANT ST
59 1471 DENVER          10TH AVE 00030FE              GRANT ST
60 1473 DENVER          10TH AVE 00045FW              GRANT ST
61 1479 DENVER          10TH AVE      AT              GRANT ST
62 1480 DENVER          10TH AVE      AT              GRANT ST
63 1481 DENVER          10TH AVE      AT              GRANT ST
64 1486 DENVER          10TH AVE      AT              GROVE ST
65 1489 DENVER          10TH AVE      AT              GROVE ST
66 1490 DENVER          10TH AVE      AT              GROVE ST
67 1493 DENVER          10TH AVE      AT             HOOKER ST
68 1495 DENVER          10TH AVE 00163FE           HUMBOLDT ST
69 1496 DENVER          10TH AVE      AT           HUMBOLDT ST
70 1498 DENVER          10TH AVE      AT               INCA ST
71 1502 DENVER          10TH AVE      AT           KALAMATH ST
72 1503 DENVER          10TH AVE      AT           KALAMATH ST
73 1509 DENVER          10TH AVE      AT               KNOX CT
74 1511 DENVER          10TH AVE      AT               KNOX CT
75 1512 DENVER          10TH AVE      AT               KNOX CT
76 1513 DENVER          10TH AVE 00100FE          LAFAYETTE ST
77 1517 DENVER          10TH AVE 00020FW            LINCOLN ST
78    1 DENVER        E 10TH AVE                       MAIN ST
79    2 DENVER        MAIN ST NW                     100TH AVE

If there are multiple strings that cause issues, then you can create a list and put it into the second filter statement. So, if you wanted to remove the row with #50:

remove.list <- paste(c("#", "BLK"), collapse = '|')

df %>%
    filter(str_detect(loc_01, "^(\\w )?(\\s)?.*(\\s)AVE|ST|BLVD(\\w )?$")) %>%
    filter(!str_detect(loc_01, remove.list))

Output

head()

     ID   city         loc_01    link                loc_02
1  1194 DENVER 1010 CURTIS ST                   31ST STREET
2  1231 DENVER  1050 ODELL ST                    E 11TH AVE
3  1298 DENVER      109TH AVE 00020FS         QUEENSBURG ST
4  1309 DENVER       10TH AVE 09999FN       10TH AVE 1407 E
5  1310 DENVER       10TH AVE      AT   10TH AVE 2900 BLK W
6  1311 DENVER       10TH AVE      AT    10TH AVE 300 BLK E
7  1312 DENVER       10TH AVE      AT   10TH AVE 3200 BLK W
8  1316 DENVER       10TH AVE      AT                1295 W
9  1328 DENVER       10TH AVE      AT                2900 W
10 1354 DENVER       10TH AVE      AT                 500 E

To filter the loc_02, we can add an additional filter statement that keeps rows that begin with a number and end with a direction.

df %>%
  filter(str_detect(loc_01, "^(\\w )?(\\s)?.*(\\s)AVE|ST|BLVD(\\w )?$")) %>%
  filter(!str_detect(loc_01, 'BLK')) %>%
  filter(str_detect(loc_02, "^[[:digit:]] ( N| S| E| W| NE| NW| SE| SW)$"))

# Or you could write it like this:
# df %>%
#   filter(str_detect(loc_01, "^(\\w )?(\\s)?.*(\\s)AVE|ST|BLVD(\\w )?$")) %>%
#   filter(!str_detect(loc_01, 'BLK')) %>%
#   filter(str_detect(loc_02, paste("^\\d (\\s)", "(", direction_abbrev, ")","$", sep = "")))

Output

    ID   city   loc_01 link loc_02
1 1316 DENVER 10TH AVE   AT 1295 W
2 1328 DENVER 10TH AVE   AT 2900 W
3 1354 DENVER 10TH AVE   AT  500 E
  •  Tags:  
  • Related