# Databricks notebook: Pull Latest S3 Data
# Refresh an external table from the newest matching CSV extract in S3.
#
# Flow: list the S3 prefix -> filter to FINAL_i-PMR_*.csv -> pick the latest
# file -> drop and recreate the table pointing at it -> preview 5 rows.
# Relies on Databricks notebook globals: dbutils, spark, display.

# Configuration
s3_path = "s3://advana-data-zone/builder/bld/edl/"
file_prefix = "FINAL_i-PMR_"
table_name = "blade_workspace.FINAL_i_PMR"

# Step 1: List all files under the prefix (dbutils.fs.ls yields FileInfo
# entries exposing .path and .name).
files = dbutils.fs.ls(s3_path)

# Step 2: Filter for matching CSV files by naming convention.
pmr_files = [f.path for f in files if f.name.startswith(file_prefix) and f.name.endswith(".csv")]

# Step 3: Fail fast if nothing matched. FileNotFoundError is more precise
# than a bare Exception and remains catchable as Exception by any caller.
if not pmr_files:
    raise FileNotFoundError(f"No files found matching pattern: {s3_path}{file_prefix}*.csv")

# Step 4: Get the latest file. max() is the O(n) equivalent of sorted()[-1].
# NOTE(review): "latest" here is lexicographic, which assumes the filename
# suffix is a sortable timestamp (e.g. YYYYMMDD) — confirm; otherwise select
# by FileInfo.modificationTime instead.
latest_file = max(pmr_files)
print(f"Found {len(pmr_files)} matching files")
print(f"Latest file: {latest_file}")

# Step 5: Recreate the table as an external CSV table over the latest file.
# DROP + CREATE ensures the table always points at exactly one extract.
spark.sql(f"DROP TABLE IF EXISTS {table_name}")
spark.sql(f"""
CREATE TABLE {table_name}
USING csv
OPTIONS (
header 'true',
ignoreLeadingWhiteSpace 'true',
ignoreTrailingWhiteSpace 'true',
inferSchema 'true'
)
LOCATION '{latest_file}'
""")
print(f"Table {table_name} created successfully!")

# Step 6: Verify by showing a preview of the freshly created table.
display(spark.sql(f"SELECT * FROM {table_name} LIMIT 5"))