Merge pull request #14 from kevinsunny1996/task/revert_back_to_previous_flow_post_manual_fix

Revert to old flow post column fix
kevinsunny1996 · Apr 30, 2024 · 5b9a1fd · 5b9a1fd
2 parents 9b3092d + 9f56263
commit 5b9a1fd
Showing 1 changed file with 48 additions and 49 deletions.
diff --git a/dags/rawg_api_extractor_dag.py b/dags/rawg_api_extractor_dag.py
@@ -370,53 +370,53 @@ def get_game_id_related_data(api_key: str, game_ids_list: list, page_number: int
 
 
     # Load contents from GCS onto BigQuery for that run
-    # load_rawg_api_ratings_data_to_bq = GCSToBigQueryOperator(
-    #         task_id=f'load_ratings_to_bq',
-    #         bucket=rawg_landing_gcs_bucket,  # Set your GCS bucket name to pick file from.
-    #         source_objects=[f'ratings_{rawg_page_number}.parquet'],  # Set the name of the CSV file in GCS
-    #         source_format='PARQUET',
-    #         destination_project_dataset_table=f'{rawg_api_bq_dataset}.ratings',  # Set your BigQuery table name to load the data to.
-    #         gcp_conn_id='gcp',  # Set your GCP connection ID.
-    #         allow_quoted_newlines=True,
-    #         ignore_unknown_values=True,
-    #         schema_fields=schema_ratings,
-    #         create_disposition='CREATE_IF_NEEDED',
-    #         autodetect=False,
-    #         write_disposition='WRITE_APPEND',  # If the table already exists, BigQuery appends the data to the table.
-    #         skip_leading_rows=1 # Skip the header row in the CSV file.
-    # )
-
-    # load_rawg_api_games_data_to_bq = GCSToBigQueryOperator(
-    #         task_id=f'load_games_to_bq',
-    #         bucket=rawg_landing_gcs_bucket,  # Set your GCS bucket name to pick file from.
-    #         source_objects=[f'games_{rawg_page_number}.parquet'],  # Set the name of the CSV file in GCS
-    #         source_format='PARQUET',
-    #         allow_quoted_newlines=True,
-    #         ignore_unknown_values=True,
-    #         destination_project_dataset_table=f'{rawg_api_bq_dataset}.games',  # Set your BigQuery table name to load the data to.
-    #         gcp_conn_id='gcp',  # Set your GCP connection ID.
-    #         create_disposition='CREATE_IF_NEEDED',
-    #         schema_fields=schema_games,
-    #         autodetect=False,
-    #         write_disposition='WRITE_APPEND',  # If the table already exists, BigQuery appends the data to the table.
-    #         skip_leading_rows=1 # Skip the header row in the CSV file.
-    # )
-
-    # load_rawg_api_genres_data_to_bq = GCSToBigQueryOperator(
-    #         task_id=f'load_genres_to_bq',
-    #         bucket=rawg_landing_gcs_bucket,  # Set your GCS bucket name to pick file from.
-    #         source_objects=[f'genres_{rawg_page_number}.parquet'],  # Set the name of the CSV file in GCS
-    #         source_format='PARQUET',
-    #         allow_quoted_newlines=True,
-    #         ignore_unknown_values=True,
-    #         destination_project_dataset_table=f'{rawg_api_bq_dataset}.genres',  # Set your BigQuery table name to load the data to.
-    #         gcp_conn_id='gcp',  # Set your GCP connection ID.
-    #         create_disposition='CREATE_IF_NEEDED',
-    #         write_disposition='WRITE_APPEND',  # If the table already exists, BigQuery appends the data to the table.
-    #         schema_fields=schema_genres,
-    #         autodetect=False,
-    #         skip_leading_rows=1 # Skip the header row in the CSV file.
-    # )
+    load_rawg_api_ratings_data_to_bq = GCSToBigQueryOperator(
+            task_id=f'load_ratings_to_bq',
+            bucket=rawg_landing_gcs_bucket,  # Set your GCS bucket name to pick file from.
+            source_objects=[f'ratings_{rawg_page_number}.parquet'],  # Set the name of the CSV file in GCS
+            source_format='PARQUET',
+            destination_project_dataset_table=f'{rawg_api_bq_dataset}.ratings',  # Set your BigQuery table name to load the data to.
+            gcp_conn_id='gcp',  # Set your GCP connection ID.
+            allow_quoted_newlines=True,
+            ignore_unknown_values=True,
+            schema_fields=schema_ratings,
+            create_disposition='CREATE_IF_NEEDED',
+            autodetect=False,
+            write_disposition='WRITE_APPEND',  # If the table already exists, BigQuery appends the data to the table.
+            skip_leading_rows=1 # Skip the header row in the CSV file.
+    )
+
+    load_rawg_api_games_data_to_bq = GCSToBigQueryOperator(
+            task_id=f'load_games_to_bq',
+            bucket=rawg_landing_gcs_bucket,  # Set your GCS bucket name to pick file from.
+            source_objects=[f'games_{rawg_page_number}.parquet'],  # Set the name of the CSV file in GCS
+            source_format='PARQUET',
+            allow_quoted_newlines=True,
+            ignore_unknown_values=True,
+            destination_project_dataset_table=f'{rawg_api_bq_dataset}.games',  # Set your BigQuery table name to load the data to.
+            gcp_conn_id='gcp',  # Set your GCP connection ID.
+            create_disposition='CREATE_IF_NEEDED',
+            schema_fields=schema_games,
+            autodetect=False,
+            write_disposition='WRITE_APPEND',  # If the table already exists, BigQuery appends the data to the table.
+            skip_leading_rows=1 # Skip the header row in the CSV file.
+    )
+
+    load_rawg_api_genres_data_to_bq = GCSToBigQueryOperator(
+            task_id=f'load_genres_to_bq',
+            bucket=rawg_landing_gcs_bucket,  # Set your GCS bucket name to pick file from.
+            source_objects=[f'genres_{rawg_page_number}.parquet'],  # Set the name of the CSV file in GCS
+            source_format='PARQUET',
+            allow_quoted_newlines=True,
+            ignore_unknown_values=True,
+            destination_project_dataset_table=f'{rawg_api_bq_dataset}.genres',  # Set your BigQuery table name to load the data to.
+            gcp_conn_id='gcp',  # Set your GCP connection ID.
+            create_disposition='CREATE_IF_NEEDED',
+            write_disposition='WRITE_APPEND',  # If the table already exists, BigQuery appends the data to the table.
+            schema_fields=schema_genres,
+            autodetect=False,
+            skip_leading_rows=1 # Skip the header row in the CSV file.
+    )
 
     load_rawg_api_platforms_data_to_bq = GCSToBigQueryOperator(
             task_id=f'load_platforms_to_bq',
@@ -474,8 +474,7 @@ def update_page_number(rawg_page_number: int) -> int:
     clear_extracted_parquet_files = remove_extracted_api_parquet_files(rawg_landing_gcs_bucket)
     next_page_number = update_page_number(rawg_page_number)
 
-    game_ids_list >> game_details_extractor  >> load_rawg_api_platforms_data_to_bq >> load_rawg_api_publishers_data_to_bq >> clear_extracted_parquet_files >> next_page_number
-# >> load_rawg_api_ratings_data_to_bq >> load_rawg_api_games_data_to_bq >> load_rawg_api_genres_data_to_bq
+    game_ids_list >> game_details_extractor >> load_rawg_api_ratings_data_to_bq >> load_rawg_api_games_data_to_bq >> load_rawg_api_genres_data_to_bq >> load_rawg_api_platforms_data_to_bq >> load_rawg_api_publishers_data_to_bq >> clear_extracted_parquet_files >> next_page_number
 
 rawg_api_extractor_dag()