Skip to content
# Start coding here...
!pip install dbt-duckdb
!dbt --version
import duckdb

df = duckdb.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet')

df.head()
Spinner
DataFrameas
df
variable
SELECT * FROM 'yellow_tripdata_2023-01.parquet'
Spinner
DataFrameas
df1
variable
select 
    '2023' as year,
	datepart('month', tpep_pickup_datetime) as month,
    VendorID, 
	avg(passenger_count) as average_passengers
from 'yellow_tripdata_2023-01.parquet'
where datepart('year', tpep_pickup_datetime) = 2023
group by year, month, VendorID
order by year, month
Spinner
DataFrameas
df2
variable
Run cancelled
select 
    '2023' as year,
	datepart('month', tpep_pickup_datetime) as month,
    VendorID, 
	count(*) as total_rides,
	avg(passenger_count) as average_passengers
from 'yellow_tripdata_2023-01.parquet'
where datepart('year', tpep_pickup_datetime) = 2023
group by year, month, VendorID
Spinner
DataFrameas
df3
variable
select
  extract (month from cal.month) as month,
  extract( days from ((cal.month + interval '1 month') - cal.month)) as length
from
  (select unnest(range(timestamp '2012-01-01', timestamp '2012-12-01', interval '1 month')) as month
  ) cal
order by month;
df3.dtypes
Spinner
DataFrameas
df4
variable
select
  regexp_replace ('abc', '(b|c)', 'X') AS aXc, -- aXc 
  regexp_replace ('abc', '(b|c)', '\\1\\1\\1\\1') AS abbbc, -- abbbc
  regexp_replace ('abc', '(.*)c', '\\1e') AS abe, -- abe
  regexp_replace ('abc', '(a)(b)', '\\2\\1') AS bac -- bac
import pandas as pd
 
# create pandas DataFrame with one column with five
# datetime values through a dictionary
df = pd.DataFrame({'DateTime': ['2021-01-15 20:02:11',
                                '1989-05-24 20:34:11',
                                '2020-01-18 14:43:24',
                                '2021-01-15 20:02:10',
                                '1999-04-04 20:34:11']})
 
df2 = pd.DataFrame({'DateTime': [20080827,
                                 20080907,
                                 20080910,
                                 20081119,
                                 20081212]})
print("Original data")
print(df)
print(df2)

# convert datetime column to just date
df2['Date'] = pd.to_datetime(df2['DateTime'].astype(str), format='%Y%m%d').dt.date.astype(str)
df2['DateMix'] = df2['DateTime'].apply(lambda x: str(x)[0:4] + '-' + str(x)[4:6] + '-' + str(x)[6:8])
df2['Date3'] = df2['DateTime'].apply(lambda x: f'{str(x)[0:4]}-{str(x)[4:6]}-{str(x)[6:8]}')
#df2['Date'] = pd.to_datetime(str(df2['DateTime']), format='%Y%m%d') #.dt.date
 
# display
print("Only date")
print(df2)
#df2['Date'] = df2['Date'].astype(str)
df2
import pandas as pd
df = pd.DataFrame({'Content': [
                               "Filename$",
                               "Filename%",
                               "File$%name",
                               "Filename"]})

print(df)

df.replace(to_replace=['\$', "%"], value="", inplace=True, regex=True)

print(df)
Spinner
DataFrameas
df5
variable
create table t as select * from read_parquet('yellow_tripdata_2023-01.parquet')
Spinner
DataFrameas
df6
variable
Spinner
DataFrameas
movies
variable
create table m as SELECT * FROM 'movies.csv'
Spinner
DataFrameas
df8
variable
select