I am working on a project for university in which I want to use the associations beetween two dataframes.
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
dataframe_one = pd.DataFrame({'city_name': ['LISBOA', 'LISBOA', 'RIO DE JANEIRO',
'BARCELONA', 'PARIS'],
'address_name':['ESPLANADA DOM CARLOS',
'AVENIDA BRASILIA',
'PARQUE NACIONAL DA TIJUCA',
'C D ARISTIDES MAILLOL',
'CHAMP DE MARS 5 AV ANATOLE']})
dataframe_two = pd.DataFrame({'city': ['RIO DE JANEIRO', 'RIO DE JANEIRO', 'LISBOA'],
'population': [6748, 6748, 504],
'address':['PQ NACIONAL TIJUCA', 'URCA',
'ESPLANADAA DOM CARLO']})
What I want to do is compare the two dataframes. So if the city is the same, then analyze the similarity between the address columns. If the address similarity value is greater than a THRESHOLD, then I assign values in the new columns. I did the following:
THRESHOLD = 80
dataframe_two['NewColumn1'] = None
dataframe_two['NewColumn2'] = 0
for i in range(0, len(dataframe_one)):
for j in range(0, len(dataframe_two)):
if(dataframe_one['city_name'].iloc[i] == dataframe_two['city'].iloc[j]):
adress1 = dataframe_one['address_name'].iloc[i]
adress2 = dataframe_two['address'].iloc[j]
value_fuzzy = fuzz.ratio(adress1, adress2)
if(value_fuzzy > THRESHOLD):
dataframe_two['NewColumn1'].loc[j] = \
dataframe_one['address_name'].iloc[i]
dataframe_two['NewColumn2'].loc[j] = value_fuzzyvalue_fuzzy
The implementation outputs to the second dataframe as expected, but I would like to replace it with more efficient code.
Output:
city population address NewColumn1 NewColumn2
RIO DE JANEIRO 6748 PQ NACIONAL TIJUCA PARQUE NACIONAL DA TIJUCA 84
RIO DE JANEIRO 6748 URCA None 0
LISBOA 504 ESPLANADAA DOM CARLO ESPLANADA DOM CARLOS 95
CodePudding user response:
Here is another implementation that finds the best match in you initial dataframe.
l = []
TRESHOLD = 80
# ITERATE OVER THE SECOND DATAFRAME
for _ ,row in dataframe_two.iterrows():
# CREATE NEW ROW FOR THE RESULT
d = {**row, 'NewColumn1':None, 'NewColumn2':None}
# FIND MATCHES
d1_subset = dataframe_one.loc[dataframe_one['city_name'] == row['city']]
# ITERATE OVER THE MATCHES
if d1_subset.shape[0] > 0:
best_adress = None
best_score = 0
adresses = d1_subset['address_name'].tolist()
# FIND BEST MATCH
for adress in adresses:
value_fuzzy = fuzz.ratio(adress, row['address'])
if value_fuzzy > best_score:
best_adress = adress
best_score = value_fuzzy
# IF BEST SCORE IS ABOVE THRESHOLD
if best_score > TRESHOLD:
d['NewColumn1'] = best_adress
d['NewColumn2'] = best_score
l.append(d)
# CREATE DATAFRAME FROM THE RESULT
df = pd.DataFrame(l)
