I am just getting started with regular expressions (working with the stringr package), and I have written some code that does not do exactly what I want it to do. I am working with a dataset with some very messy string data and am trying to clean it up to be used with a google maps API.
I've attached a sample of the data below.
Basically, I want to select every row where loc_01 is a simple street name. By this, I mean I want it to take on the following formats:
A numbered street, such as 10th Ave; A named street, such as MAIN ST, and any directional modification of such street names (such as 10TH AVE NW, W MAIN ST, or W 10TH AVE.)
I have tried the following expression:
df %>% filter(str_detect(loc_01, "^(\\w )?(\\s)?.*(\\s)AVE|ST|BLVD(\\w )?$"))
But this gives me outputs such as 10 AVE 1300 BLK E, which is not an observation I want to select. I interpret my regular expression as:
- "select the rows beginning with a character (optional) - to take care of strings beginning with N, NW, etc, followed by a space (if necessary), a 'word' of any length beginning with any character (for 10th st or MLK Ave), followed by a space and the suffixes 'AVE, ST, or BLVD', and an optional word (to take care of 10th st W and the like)."
Clearly, my interpretation is wrong since I am getting things like 10 AVD 1300 BLK E. What is the correct regular expression to use in this case to get what I want?
Thank you very much for the help!
structure(list(ID = c("387", "404", "422", "425", "432", "443",
"526", "536", "580", "658", "665", "666", "735", "880", "910",
"911", "912", "913", "916", "917", "972", "1098", "1194", "1231",
"1298", "1309", "1310", "1311", "1312", "1316", "1328", "1354",
"1371", "1373", "1374", "1376", "1381", "1388", "1389", "1390",
"1391", "1392", "1393", "1406", "1407", "1408", "1409", "1410",
"1411", "1412", "1413", "1414", "1418", "1420", "1422", "1429",
"1430", "1433", "1434", "1437", "1441", "1442", "1443", "1444",
"1445", "1448", "1451", "1452", "1453", "1454", "1455", "1457",
"1461", "1462", "1463", "1464", "1466", "1468", "1470", "1471",
"1473", "1479", "1480", "1481", "1486", "1489", "1490", "1493",
"1495", "1496", "1498", "1502", "1503", "1509", "1511", "1512",
"1513", "1517", "1", "2"), city = c("DENVER", "DENVER", "DENVER",
"DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER",
"DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER",
"DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER",
"DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER",
"DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER",
"DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER",
"DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER",
"DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER",
"DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER",
"DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER",
"DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER",
"DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER",
"DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER",
"DENVER", "DENVER", "DENVER", "DENVER", "DENVER", "DENVER"),
loc_01 = c("#50 S KALAMATH ST", "00 BLKS BRYANT CANOSA",
"000 BLK ALLEY", "000 BLK BROADWAY", "000 BLK E 11TH AV",
"000 BLK E 17TH", "000 BLK S BROADWAY", "000 BLK S IRVING JULIAN",
"000 BLK W ALAMEDA AV", "10 AVE 1300 BLK E", "100 BLK ALLEY N BROADWAY/N ACOMA",
"100 BLK ALLEY S", "100 BLK N WASHINGTON ST", "1000 ALLEY LINCOLN/BROADWAY",
"1000 BLK ALLEY CHEROKEE/DELAWARE", "1000 BLK ALLEY GRANT",
"1000 BLK ALLEY MARTIN/LAFAYETT", "1000 BLK ALLEY MONROE/GARFIELD",
"1000 BLK ALLEY OGDEN", "1000 BLK ALLEY S GAYLORD ST", "1000 BLK E GAY",
"1000 BLK S VINE/GAYLORD ALLEY", "1010 CURTIS ST", "1050 ODELL ST",
"109TH AVE", "10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE",
"10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE",
"10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE",
"10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE",
"10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE",
"10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE",
"10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE",
"10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE",
"10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE",
"10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE",
"10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE",
"10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE",
"10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE",
"10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE",
"10TH AVE", "10TH AVE", "10TH AVE", "10TH AVE", "E 10TH AVE",
"MAIN ST NW"), link = c("", "", "", "00125FN", "00025FW",
"AT", "", "00050FS", "00005FW", "00100FE", "00043FN", "",
"", "", "", "AT", "", "00120FS", "", "00070FN", "", "00200FS",
"", "", "00020FS", "09999FN", "AT", "AT", "AT", "AT", "AT",
"AT", "AT", "00080FW", "00175FW", "AT", "00101FW", "AT",
"AT", "AT", "AT", "AT", "AT", "00060FE", "00120FS", "AT",
"AT", "AT", "AT", "00015FW", "00035FW", "00075FW", "00022FE",
"00144FW", "00250FE", "AT", "AT", "00037FW", "00100FE", "00200FW",
"AT", "AT", "00084FW", "00100FW", "AT", "00100FN", "AT",
"AT", "AT", "AT", "AT", "00100FW", "00068FE", "00136FE",
"00200FE", "00150FW", "AT", "00020FE", "00020FW", "00030FE",
"00045FW", "AT", "AT", "AT", "AT", "AT", "AT", "AT", "00163FE",
"AT", "AT", "AT", "AT", "AT", "AT", "AT", "00100FE", "00020FW",
"", ""), loc_02 = c("N SIDE OF BLDG", "ALLEY", "KNOX CT/KING ST",
"IRVINGTON PL", "N LINDA ST", "POLE 94 79", "PKG METER BS-46",
"ALLEY", "POLE 844/005", "MARION ST N", "W 1ST AV", "MEADE/NEWTON",
"E 1ST AVE", "10 AV", "W 11TH AVE", "LOGAN ST", "E 11TH AV",
"E 11TH AVE", "CORONA ST", "E MISSISSIPPI AVE", "CORONA ST",
"E TENNESSEE AVE", "31ST STREET", "E 11TH AVE", "QUEENSBURG ST",
"10TH AVE 1407 E", "10TH AVE 2900 BLK W", "10TH AVE 300 BLK E",
"10TH AVE 3200 BLK W", "1295 W", "2900 W", "500 E", "ACOMA / BANNOCK ALLEY",
"ACOMA ST", "ACOMA ST", "ADAMS ST", "BANNOCK ST", "BANNOCK ST",
"BANNOCK ST", "BANNOCK ST", "BANNOCK ST", "BANNOCK ST", "BANNOCK ST N",
"BROADWAY ST", "BROADWAY ST", "BROADWAY ST", "BROADWAY ST",
"BROADWAY ST", "BROADWAY ST", "BROADWAY ST N", "BRYANT ST",
"BRYANT ST", "CLARKSON ST", "CLARKSON ST", "CLARKSON ST",
"CLARKSON ST", "CLARKSON ST", "CORONA ST", "CORONA ST", "CORONA ST",
"CORONA ST", "CORONA ST", "CORONA ST N", "DECATUR ST", "DECATUR ST",
"DOWNING ST", "DOWNING ST", "DOWNING ST", "DOWNING ST", "DOWNING ST",
"DOWNING ST", "DOWNING ST N", "FEDERAL BLVD", "FEDERAL BLVD",
"FEDERAL BLVD", "GALAPAGO ST", "GARFIELD ST", "GRANT ST",
"GRANT ST", "GRANT ST", "GRANT ST", "GRANT ST", "GRANT ST",
"GRANT ST", "GROVE ST", "GROVE ST", "GROVE ST", "HOOKER ST",
"HUMBOLDT ST", "HUMBOLDT ST", "INCA ST", "KALAMATH ST", "KALAMATH ST",
"KNOX CT", "KNOX CT", "KNOX CT", "LAFAYETTE ST", "LINCOLN ST",
"MAIN ST", "100TH AVE")), row.names = c(NA, -100L), class = "data.frame")
CodePudding user response:
One way could be to add an additional filter statement (though I'm sure there's a better way to do it).
library(tidyverse)
df %>%
filter(str_detect(loc_01, "^(\\w )?(\\s)?.*(\\s)AVE|ST|BLVD(\\w )?$")) %>%
filter(!str_detect(loc_01, 'BLK'))
Output
ID city loc_01 link loc_02
1 387 DENVER #50 S KALAMATH ST N SIDE OF BLDG
2 1194 DENVER 1010 CURTIS ST 31ST STREET
3 1231 DENVER 1050 ODELL ST E 11TH AVE
4 1298 DENVER 109TH AVE 00020FS QUEENSBURG ST
5 1309 DENVER 10TH AVE 09999FN 10TH AVE 1407 E
6 1310 DENVER 10TH AVE AT 10TH AVE 2900 BLK W
7 1311 DENVER 10TH AVE AT 10TH AVE 300 BLK E
8 1312 DENVER 10TH AVE AT 10TH AVE 3200 BLK W
9 1316 DENVER 10TH AVE AT 1295 W
10 1328 DENVER 10TH AVE AT 2900 W
11 1354 DENVER 10TH AVE AT 500 E
12 1371 DENVER 10TH AVE AT ACOMA / BANNOCK ALLEY
13 1373 DENVER 10TH AVE 00080FW ACOMA ST
14 1374 DENVER 10TH AVE 00175FW ACOMA ST
15 1376 DENVER 10TH AVE AT ADAMS ST
16 1381 DENVER 10TH AVE 00101FW BANNOCK ST
17 1388 DENVER 10TH AVE AT BANNOCK ST
18 1389 DENVER 10TH AVE AT BANNOCK ST
19 1390 DENVER 10TH AVE AT BANNOCK ST
20 1391 DENVER 10TH AVE AT BANNOCK ST
21 1392 DENVER 10TH AVE AT BANNOCK ST
22 1393 DENVER 10TH AVE AT BANNOCK ST N
23 1406 DENVER 10TH AVE 00060FE BROADWAY ST
24 1407 DENVER 10TH AVE 00120FS BROADWAY ST
25 1408 DENVER 10TH AVE AT BROADWAY ST
26 1409 DENVER 10TH AVE AT BROADWAY ST
27 1410 DENVER 10TH AVE AT BROADWAY ST
28 1411 DENVER 10TH AVE AT BROADWAY ST
29 1412 DENVER 10TH AVE 00015FW BROADWAY ST N
30 1413 DENVER 10TH AVE 00035FW BRYANT ST
31 1414 DENVER 10TH AVE 00075FW BRYANT ST
32 1418 DENVER 10TH AVE 00022FE CLARKSON ST
33 1420 DENVER 10TH AVE 00144FW CLARKSON ST
34 1422 DENVER 10TH AVE 00250FE CLARKSON ST
35 1429 DENVER 10TH AVE AT CLARKSON ST
36 1430 DENVER 10TH AVE AT CLARKSON ST
37 1433 DENVER 10TH AVE 00037FW CORONA ST
38 1434 DENVER 10TH AVE 00100FE CORONA ST
39 1437 DENVER 10TH AVE 00200FW CORONA ST
40 1441 DENVER 10TH AVE AT CORONA ST
41 1442 DENVER 10TH AVE AT CORONA ST
42 1443 DENVER 10TH AVE 00084FW CORONA ST N
43 1444 DENVER 10TH AVE 00100FW DECATUR ST
44 1445 DENVER 10TH AVE AT DECATUR ST
45 1448 DENVER 10TH AVE 00100FN DOWNING ST
46 1451 DENVER 10TH AVE AT DOWNING ST
47 1452 DENVER 10TH AVE AT DOWNING ST
48 1453 DENVER 10TH AVE AT DOWNING ST
49 1454 DENVER 10TH AVE AT DOWNING ST
50 1455 DENVER 10TH AVE AT DOWNING ST
51 1457 DENVER 10TH AVE 00100FW DOWNING ST N
52 1461 DENVER 10TH AVE 00068FE FEDERAL BLVD
53 1462 DENVER 10TH AVE 00136FE FEDERAL BLVD
54 1463 DENVER 10TH AVE 00200FE FEDERAL BLVD
55 1464 DENVER 10TH AVE 00150FW GALAPAGO ST
56 1466 DENVER 10TH AVE AT GARFIELD ST
57 1468 DENVER 10TH AVE 00020FE GRANT ST
58 1470 DENVER 10TH AVE 00020FW GRANT ST
59 1471 DENVER 10TH AVE 00030FE GRANT ST
60 1473 DENVER 10TH AVE 00045FW GRANT ST
61 1479 DENVER 10TH AVE AT GRANT ST
62 1480 DENVER 10TH AVE AT GRANT ST
63 1481 DENVER 10TH AVE AT GRANT ST
64 1486 DENVER 10TH AVE AT GROVE ST
65 1489 DENVER 10TH AVE AT GROVE ST
66 1490 DENVER 10TH AVE AT GROVE ST
67 1493 DENVER 10TH AVE AT HOOKER ST
68 1495 DENVER 10TH AVE 00163FE HUMBOLDT ST
69 1496 DENVER 10TH AVE AT HUMBOLDT ST
70 1498 DENVER 10TH AVE AT INCA ST
71 1502 DENVER 10TH AVE AT KALAMATH ST
72 1503 DENVER 10TH AVE AT KALAMATH ST
73 1509 DENVER 10TH AVE AT KNOX CT
74 1511 DENVER 10TH AVE AT KNOX CT
75 1512 DENVER 10TH AVE AT KNOX CT
76 1513 DENVER 10TH AVE 00100FE LAFAYETTE ST
77 1517 DENVER 10TH AVE 00020FW LINCOLN ST
78 1 DENVER E 10TH AVE MAIN ST
79 2 DENVER MAIN ST NW 100TH AVE
If there are multiple strings that cause issues, then you can create a list and put it into the second filter statement. So, if you wanted to remove the row with #50:
remove.list <- paste(c("#", "BLK"), collapse = '|')
df %>%
filter(str_detect(loc_01, "^(\\w )?(\\s)?.*(\\s)AVE|ST|BLVD(\\w )?$")) %>%
filter(!str_detect(loc_01, remove.list))
Output
head()
ID city loc_01 link loc_02
1 1194 DENVER 1010 CURTIS ST 31ST STREET
2 1231 DENVER 1050 ODELL ST E 11TH AVE
3 1298 DENVER 109TH AVE 00020FS QUEENSBURG ST
4 1309 DENVER 10TH AVE 09999FN 10TH AVE 1407 E
5 1310 DENVER 10TH AVE AT 10TH AVE 2900 BLK W
6 1311 DENVER 10TH AVE AT 10TH AVE 300 BLK E
7 1312 DENVER 10TH AVE AT 10TH AVE 3200 BLK W
8 1316 DENVER 10TH AVE AT 1295 W
9 1328 DENVER 10TH AVE AT 2900 W
10 1354 DENVER 10TH AVE AT 500 E
To filter the loc_02, we can add an additional filter statement that keeps rows that begin with a number and end with a direction.
df %>%
filter(str_detect(loc_01, "^(\\w )?(\\s)?.*(\\s)AVE|ST|BLVD(\\w )?$")) %>%
filter(!str_detect(loc_01, 'BLK')) %>%
filter(str_detect(loc_02, "^[[:digit:]] ( N| S| E| W| NE| NW| SE| SW)$"))
# Or you could write it like this:
# df %>%
# filter(str_detect(loc_01, "^(\\w )?(\\s)?.*(\\s)AVE|ST|BLVD(\\w )?$")) %>%
# filter(!str_detect(loc_01, 'BLK')) %>%
# filter(str_detect(loc_02, paste("^\\d (\\s)", "(", direction_abbrev, ")","$", sep = "")))
Output
ID city loc_01 link loc_02
1 1316 DENVER 10TH AVE AT 1295 W
2 1328 DENVER 10TH AVE AT 2900 W
3 1354 DENVER 10TH AVE AT 500 E
