How do I extract/view the csv files created by this block of code? What the code does is pull data from a table and extract the json elements. I am running this code on Jupyter notebook but I can't figure out how to view the csv file.
print('Conducting ETL for all events data')
class CsvBuilder:
def __init__(self, fileName: str):
self._fileName = fileName
self._headers = []
self._rows = []
def hasHeaders(self) -> bool:
return len(self._headers) > 0
def setHeaders(self, headers):
self._headers = headers
def addRow(self, row):
self._rows.append(row)
def write(self):
file_existed = os.path.isfile(self._fileName)
with open(self._fileName, 'a') as outputFile:
writer = csv.writer(outputFile)
if not file_existed:
writer.writerow(self._headers)
writer.writerows(self._rows)
class NullBuilder(CsvBuilder):
def __init__(self):
super().__init__(None)
def hasHeaders(self) -> bool:
pass
def setHeaders(self, headers):
pass
def addRow(self, row):
pass
def write(self):
pass
class CsvBuilderProvider:
def __init__(self):
self._builders = {}
self._defaultBuilder = NullBuilder()
def registerBuilder(self, event: str, builder: CsvBuilder) -> 'CsvBuilderProvider':
self._builders[event] = builder
return self
def getBuilderForEvent(self, event: str):
if event in self._builders.keys():
return self._builders[event]
return self._defaultBuilder
def isJson(value) :
return str(value)[0] == '{'
for start_date in weeks:
end_date = start_date dt.timedelta(days=7)
print('Querying for date range: {} to {}'.format(start_date, end_date))
query_data = """
SELECT events
FROM webschema.data
WHERE created_at >= DATE('{start_date}')
AND created_at < DATE('{end_date}')
;
""".format(start_date=start_date, end_date=end_date)
cursor.execute(query_data)
rows = cursor.fetchall()
print('Query completed')
csvBuilderProvider = CsvBuilderProvider()
csvBuilderProvider.registerBuilder('open', CsvBuilder('open.csv')) \
.registerBuilder('send', CsvBuilder('send.csv')) \
.registerBuilder('click', CsvBuilder('click.csv')) \
.registerBuilder('soft_bounce', CsvBuilder('soft_bounce.csv')) \
.registerBuilder('hard_bounce', CsvBuilder('hard_bounce.csv'))
print('Processing data')
csvBuilders = {}
processed = 0
errored = 0
for row in rows:
try:
jsonData = json.loads(row[0])
for event in jsonData:
eventName = event["event"]
csvBuilder = csvBuilders.setdefault(eventName, csvBuilderProvider.getBuilderForEvent(eventName))
if not csvBuilder.hasHeaders():
csvBuilder.setHeaders(event.keys())
newData = []
for value in event.values():
quotedValue = json.dumps(value) if isJson(value) else value
newData.append(quotedValue)
csvBuilder.addRow(newData)
processed = 1
except Exception as e:
errored = 1
print('Appending processed data to CSV')
for builder in csvBuilders.values():
builder.write()
I don't have a clue how to extract the csv files. It says file not found when I try to run
pd.read_csv('open.csv')
CodePudding user response:
You can add a current working directory to your paths, pathlib module is a good choice for that:
import pathlib
CWD = pathlib.Path().resolve()
...
CsvBuilder(CWD / 'open.csv')
...
pd.read_csv(CWD / 'open.csv')
CodePudding user response:
Doesn't this work?
pd.read_csv(csvBuilderProvider.getBuilderForEvent("open")._fileName)
PS: Your code sample has incorrect indentation, also NullBuilder is poorly written, it initialises with an invalid file name, so the least it should do is raise an error (e.g. RuntimeError) when write() is called.
CodePudding user response:
Lambda stores files locally on the /tmp folder.
So in order to open the file, you would need to do something like this:
import os
os.chdir('/tmp')
pd.read_csv('open.csv')
