I have a folder of csv files that I'd like to loop over to create individual DataFrames named after the file itself.
So if I have file_1.csv, file_2.csv, file_3.csv ... I'd like DataFrames created for each file and have the df named after the file of the data it contains.
Here is what I've tried so far:
# Collect the CSV files in ./Data/ and derive a DataFrame name from each one.
all_files = os.listdir("./Data/")
# keep only the csv files
csv_files = [name for name in all_files if name.endswith('.csv')]
# remove the file extension to get the name only; splitext strips whatever
# suffix is present instead of assuming it is exactly four characters long
file_names = [os.path.splitext(name)[0] for name in csv_files]
# create a DataFrame from each csv file, in the same order as csv_files
dfs = []


def make_files_dfs():
    """Read every file listed in ``csv_files`` from ``Data/`` into ``dfs``.

    The DataFrames are appended to the module-level ``dfs`` list in the same
    order as ``csv_files``, so ``dfs[i]`` holds the data of ``csv_files[i]``.

    Returns:
        None. The result is the side effect on ``dfs``.

    Raises:
        pandas.errors.ParserError: propagated from ``pd.read_csv`` when a
            file cannot be tokenized (for example a row with more fields
            than expected).
    """
    for csv_name in csv_files:
        # Build the path as a plain string. Routing it through eval() added
        # nothing and broke on any file name containing a quote character.
        dfs.append(pd.read_csv(f"Data/{csv_name}"))
error log:
--------------------------------------------------------------------------- ParserError Traceback (most recent call
last) ~\AppData\Local\Temp/ipykernel_592/2054074323.py in <module>
----> 1 make_files_dfs()
~\AppData\Local\Temp/ipykernel_592/3264801573.py in make_files_dfs()
3 for a,b in zip(file_names, csv_files):
4 if a == b[:-4]:
----> 5 a = pd.read_csv(eval(f"'Data/{b}'"))
6 dfs.append(a)
~\miniconda3\envs\selenium_env\lib\site-packages\pandas\util\_decorators.py
in wrapper(*args, **kwargs)
309 stacklevel=stacklevel,
310 )
--> 311 return func(*args, **kwargs)
312
313 return wrapper
~\miniconda3\envs\selenium_env\lib\site-packages\pandas\io\parsers\readers.py
in read_csv(filepath_or_buffer, sep, delimiter, header, names,
index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine,
converters, true_values, false_values, skipinitialspace, skiprows,
skipfooter, nrows, na_values, keep_default_na, na_filter, verbose,
skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col,
date_parser, dayfirst, cache_dates, iterator, chunksize, compression,
thousands, decimal, lineterminator, quotechar, quoting, doublequote,
escapechar, comment, encoding, encoding_errors, dialect,
error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace,
low_memory, memory_map, float_precision, storage_options)
584 kwds.update(kwds_defaults)
585
--> 586 return _read(filepath_or_buffer, kwds)
587
588
~\miniconda3\envs\selenium_env\lib\site-packages\pandas\io\parsers\readers.py
in _read(filepath_or_buffer, kwds)
486
487 with parser:
--> 488 return parser.read(nrows)
489
490
~\miniconda3\envs\selenium_env\lib\site-packages\pandas\io\parsers\readers.py
in read(self, nrows) 1045 def read(self, nrows=None): 1046
nrows = validate_integer("nrows", nrows)
-> 1047 index, columns, col_dict = self._engine.read(nrows) 1048 1049 if index is None:
~\miniconda3\envs\selenium_env\lib\site-packages\pandas\io\parsers\c_parser_wrapper.py
in read(self, nrows)
221 try:
222 if self.low_memory:
--> 223 chunks = self._reader.read_low_memory(nrows)
224 # destructive to chunks
225 data = _concatenate_chunks(chunks)
~\miniconda3\envs\selenium_env\lib\site-packages\pandas\_libs\parsers.pyx
in pandas._libs.parsers.TextReader.read_low_memory()
~\miniconda3\envs\selenium_env\lib\site-packages\pandas\_libs\parsers.pyx
in pandas._libs.parsers.TextReader._read_rows()
~\miniconda3\envs\selenium_env\lib\site-packages\pandas\_libs\parsers.pyx
in pandas._libs.parsers.TextReader._tokenize_rows()
~\miniconda3\envs\selenium_env\lib\site-packages\pandas\_libs\parsers.pyx
in pandas._libs.parsers.raise_parser_error()
ParserError: Error tokenizing data. C error: Expected 70 fields in line 7728, saw 74
CodePudding user response:
Your code is a bit difficult to understand, and it contains some unnecessary steps. First of all, it is easier to change the working directory with os.chdir(path). Secondly, you can get rid of your lambda function and use glob.glob instead. Lastly, you cannot easily create a variable whose name is taken from a string at runtime, and your dfs list only holds the DataFrame objects themselves, which gives you no insight into which file each one came from. It is much better to use a dictionary keyed by file name. Overall, this is what your code could look like:
import os
import glob
path = "the path to your data"
os.chdir(path)
# get list of only csv files in the new working directory; the pattern must
# be relative, because a leading "/" makes glob search the filesystem root
csv_files = glob.glob("*.csv")
# create a dictionary with key as the DF name and values as DataFrames
dataFrameDictionary = {}
def make_files_dfs():
    """Load every path in ``csv_files`` into ``dataFrameDictionary``.

    Each DataFrame is stored under its path with the file extension removed,
    e.g. ``file_1.csv`` is stored under the key ``"file_1"``.

    Returns:
        None. The result is the side effect on ``dataFrameDictionary``.
    """
    for csv_path in csv_files:
        frame_name = os.path.splitext(csv_path)[0]
        # Assign into the dictionary. A bare subscript expression only
        # performs a lookup and never stores the DataFrame.
        dataFrameDictionary[frame_name] = pd.read_csv(csv_path)
CodePudding user response:
I don't understand why your code is so lengthy, but this can be done as follows:
csv_list = ['file_1.csv', 'file_2.csv', 'file_3.csv']
# Number the frames from 1 so the first file becomes df_1, the second df_2,
# and so on.
# NOTE: injecting names through globals() works at module level, but a dict
# keyed by name is usually easier to inspect and iterate over.
for i, csv_name in enumerate(csv_list, start=1):
    globals()[f"df_{i}"] = pd.read_csv(csv_name)
Output:
Three DataFrames will be created: df_1 will hold the first file in the list, df_2 the second file, and so on.
