While merging 100 CSV files, how to fill nan in a column if it doesn't exist in "usecol&q-CodePudding

Considering that I have CSV files which looks roughly like this

df = pd.DataFrame({'Col1': ['A', 'B', 'C', 'D'],
                   'ColB': [80, 75, 70, 65]})

I am using the following script which was suggested here

import pandas as pd
import glob

path = r'path/' # use your path
all_files = glob.glob(path   "/*.csv")

fields = ['ColA', 'ColB', 'ColC']

first_one = True
for filename in all_files:

    if not first_one: # if it is not the first csv file then skip the header row (row 0) of that file
        skip_row = [0]
    else:
        skip_row = []

# works with this version: '1.3.4'

# combine into one
mode = "w"
header = True
for filename in all_files:
    with pd.read_csv(
        filename,
        engine="python",
        iterator=True,
        chunksize=10_000,
        usecols = fields
    ) as reader:
        for df in reader:
            filename = os.path.basename(filename)
            df["username"] = filename
            df.to_csv("New_File.csv", index=False, mode=mode, header=header)
            mode = "a"
            header = False

Most of the files have all three columns, while few of them do not have ColC. This will give an error (understandably) which is as follows:

ValueError: Usecols do not match columns, columns expected but not found: ['ColC']

How can I put nan in ColC while keep columns list unchanged?

CodePudding user response：

Here is one alternative checking the columns beforehand:

# (...)
for filename in all_files:
    # Check available columns first
    cols = pd.read_csv(filename, engine='python', nrows=0, header=0).columns
    fields_ = cols.intersection(fields)
    missed = [i for i in fields if i not in cols]
    
    with pd.read_csv(
        filename,
        engine="python",
        iterator=True,
        chunksize=10_000,
        header=0,
        usecols = fields_  # Use the "dynamic" one
    ) as reader:
        for df in reader:
            # Manually append missed cols
            if missed:
                 for col in missed:
                    df[col] = np.nan

                # Make sure the order is kept
                df = df[fields]

            # (proceed...)

            filename = os.path.basename(filename)
            df["username"] = filename
# (...)

CodePudding user response：

Changed your example columns list to have multiple missing columns. But without changing columns in the solution using an augmented example file.

import pandas as pd
import re

columns = ['Col1','ColB','ColC','ColD']
try:
    df = pd.read_csv('test.csv',usecols=columns)
except ValueError as e:
    if 'Usecols' not in str(e): raise e
    missing = re.findall(r"'(.*?)'", str(e))
    df = pd.read_csv('test.csv', usecols=set(columns) - set(missing))
    df[missing] = np.nan
df

Output

  Col1  ColB  ColC  ColD
0    A    80   NaN   NaN
1    B    75   NaN   NaN
2    C    70   NaN   NaN
3    D    65   NaN   NaN

Creating an example csv file with extra and missing columns

import pandas as pd

df = pd.DataFrame({'Col1': ['A', 'B', 'C', 'D'],
                   'ColB': [80, 75, 70, 65],
                   'ColE': [100, 20, 1, 23]})
df.to_csv('test.csv', index=False)

test.csv

Col1,ColB,ColE
A,80,100
B,75,20
C,70,1
D,65,23

CodePudding user response：

If the use of 'usecols' is not a necessity, you could achieve this by using .reindex() instead of 'usecols' like so:

columns = ['Col1','ColB','ColC']
df = pd.read_csv('test.csv').reindex(columns=columns)

Output

  Col1  ColB  ColC
0    A    80   NaN
1    B    75   NaN
2    C    70   NaN
3    D    65   NaN