I'm trying to build a program that automatically creates our batch pipelines from MySQL to BigQuery and I'm randomly getting this linting error: "Undefined Variable 'self'".
The DESTINATION_TABLE_FORMAT class constant cannot call self.get_environment(): the linter reports the "Undefined Variable 'self'" error on that line.
When I indent data_lake_paths one level further, so that it sits inside __init__, the error goes away, but I can then no longer access it as an attribute from outside the method.
Any help would be much appreciated
# NOTE(review): the indentation of this snippet was lost when it was pasted.
# Everything from the docstring down is the class body, and the self.*
# assignments are presumably the body of __init__ -- confirm against the
# original file.
class MySQLBatchPipeline:
'''
Generate MySQL batch pipelines that store CSVs in GCS
and then import them into BigQuery.
'''
# Class-level attributes: evaluated once, while the class body executes.
export_format='CSV'
# `self` is only bound inside a method call; no instance exists while the
# class body runs, which is why the linter reports it as undefined here.
# NOTE(review): an operator (presumably `+`) appears to be missing between
# the call and the string literal -- confirm against the original file.
DESTINATION_TABLE_FORMAT = self.get_environment() '.{dataset}.{table}' #<- linter flags this as "Undefined Variable 'self'"
# Constructor: copies its arguments onto the instance, then calls two helpers.
def __init__(
self,
dag,
sql_directory,
gcp_project_id,
mysql_connection_id,
source_schema,
source_table,
gcs_connection_id,
bq_connection_id,
gcs_bucket,
destination_staging_schema,
destination_schema,
destination_table,
environment,
time_delay,
country,
max_file_size: int=int(50e6),
):
# NOTE(review): each assignment below down to max_file_size ends with a
# trailing comma, so the stored value is a one-element tuple, e.g. (dag,),
# rather than the argument itself -- presumably unintended.
self.dag = dag,
self.sql_directory = sql_directory,
self.gcp_project_id = gcp_project_id,
self.mysql_connection_id = mysql_connection_id,
self.source_schema = source_schema,
self.source_table = source_table,
self.gcs_connection_id = gcs_connection_id,
self.bq_connection_id = bq_connection_id,
self.gcs_bucket = gcs_bucket,
self.destination_staging_schema = destination_staging_schema,
self.destination_schema = destination_schema,
self.destination_table = destination_table,
self.time_delay = time_delay,
self.environment = environment,
self.max_file_size = max_file_size,
# get_pipeline_queries() and get_schema_files() are not shown in this snippet.
self.queries = self.get_pipeline_queries()
self.schema_file = self.get_schema_files()
# country is assigned only after the two helper calls above have run.
self.country = country
# variables
# Per the question text, this line sits at class-body level (outside
# __init__), where `self` is not defined.
data_lake_paths = GoogleCloudStoragePaths(self.destination_table) #<- the same Undefined Variable 'self' error is flagged here.
CodePudding user response:
since
def __init__(self, ...):
...
# this variable
data_lake_paths = GoogleCloudStoragePaths(self.destination_table)
is outside of any method that takes a self parameter, Python treats it as a class variable rather than an instance variable. The class body is executed without any instance, so self does not exist there (that's why the "Undefined Variable 'self'" error occurs).
Put the assignment inside the __init__ method (or another method) and assign it to self.data_lake_paths so that it remains accessible on the instance, and it should work.
CodePudding user response:
Variables defined directly in the body of a class cannot use self; only code inside methods can. Instead, move the definition of that variable into the __init__ method and assign it as an attribute of self. If you need the name to exist even before __init__ has run, set it to None in the class body.
code:
class MySQLBatchPipeline:
    """Generate a MySQL batch pipeline that stores CSVs in GCS and then
    imports them into BigQuery.

    Everything that depends on constructor arguments is assigned in
    ``__init__``. The class body runs once, when the class is defined, and
    no ``self`` exists at that point.
    """

    export_format = 'CSV'
    # Placeholders so both names exist on the class even before __init__ has
    # run; every instance overrides them in __init__.
    DESTINATION_TABLE_FORMAT = None
    data_lake_paths = None

    def __init__(
        self,
        dag,
        sql_directory,
        gcp_project_id,
        mysql_connection_id,
        source_schema,
        source_table,
        gcs_connection_id,
        bq_connection_id,
        gcs_bucket,
        destination_staging_schema,
        destination_schema,
        destination_table,
        environment,
        time_delay,
        country,
        max_file_size: int = int(50e6),
    ):
        """Store the pipeline configuration and build the derived attributes.

        Each argument is stored unchanged on the instance under the same
        name. ``max_file_size`` defaults to ``int(50e6)``.
        """
        # Plain configuration. No trailing commas: `self.x = x,` would store
        # the one-element tuple (x,) instead of x.
        self.dag = dag
        self.sql_directory = sql_directory
        self.gcp_project_id = gcp_project_id
        self.mysql_connection_id = mysql_connection_id
        self.source_schema = source_schema
        self.source_table = source_table
        self.gcs_connection_id = gcs_connection_id
        self.bq_connection_id = bq_connection_id
        self.gcs_bucket = gcs_bucket
        self.destination_staging_schema = destination_staging_schema
        self.destination_schema = destination_schema
        self.destination_table = destination_table
        self.time_delay = time_delay
        self.environment = environment
        self.max_file_size = max_file_size
        self.country = country

        # Derived values come last so the helpers below can read every
        # attribute assigned above.
        # NOTE(review): assumes get_environment() is defined elsewhere on this
        # class (not shown in the snippet) and returns a string -- confirm.
        self.DESTINATION_TABLE_FORMAT = (
            self.get_environment() + '.{dataset}.{table}'
        )
        self.queries = self.get_pipeline_queries()
        self.schema_file = self.get_schema_files()
        # Assigned on self (not as a local) so it is reachable as
        # instance.data_lake_paths after construction.
        self.data_lake_paths = GoogleCloudStoragePaths(self.destination_table)
