# Databricks notebook: Pull Latest S3 Data
# Refresh an external table from the newest matching CSV extract in S3.
#
# Flow: list the S3 prefix -> filter to FINAL_i-PMR_*.csv -> pick the latest
# file -> drop and recreate the table pointing at it -> preview 5 rows.
# Relies on Databricks notebook globals: dbutils, spark, display.

# Configuration
s3_path = "s3://advana-data-zone/builder/bld/edl/"
file_prefix = "FINAL_i-PMR_"
table_name = "blade_workspace.FINAL_i_PMR"

# Step 1: List all files under the prefix (dbutils.fs.ls yields FileInfo
# entries exposing .path and .name).
files = dbutils.fs.ls(s3_path)

# Step 2: Filter for matching CSV files by naming convention.
pmr_files = [f.path for f in files if f.name.startswith(file_prefix) and f.name.endswith(".csv")]

# Step 3: Fail fast if nothing matched. FileNotFoundError is more precise
# than a bare Exception and remains catchable as Exception by any caller.
if not pmr_files:
    raise FileNotFoundError(f"No files found matching pattern: {s3_path}{file_prefix}*.csv")

# Step 4: Get the latest file. max() is the O(n) equivalent of sorted()[-1].
# NOTE(review): "latest" here is lexicographic, which assumes the filename
# suffix is a sortable timestamp (e.g. YYYYMMDD) — confirm; otherwise select
# by FileInfo.modificationTime instead.
latest_file = max(pmr_files)
print(f"Found {len(pmr_files)} matching files")
print(f"Latest file: {latest_file}")

# Step 5: Recreate the table as an external CSV table over the latest file.
# DROP + CREATE ensures the table always points at exactly one extract.
spark.sql(f"DROP TABLE IF EXISTS {table_name}")
spark.sql(f"""
CREATE TABLE {table_name}
USING csv
OPTIONS (
header 'true',
ignoreLeadingWhiteSpace 'true',
ignoreTrailingWhiteSpace 'true',
inferSchema 'true'
)
LOCATION '{latest_file}'
""")
print(f"Table {table_name} created successfully!")

# Step 6: Verify by showing a preview of the freshly created table.
display(spark.sql(f"SELECT * FROM {table_name} LIMIT 5"))