Skip to content
# Start coding here...
!pip install dbt-duckdb
!dbt --version
import duckdb
df = duckdb.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet')
df.head()
Unknown integration
DataFrameavailable as
df
variable
SELECT * FROM 'yellow_tripdata_2023-01.parquet'
Unknown integration
DataFrameavailable as
df1
variable
select
'2023' as year,
datepart('month', tpep_pickup_datetime) as month,
VendorID,
avg(passenger_count) as average_passengers
from 'yellow_tripdata_2023-01.parquet'
where datepart('year', tpep_pickup_datetime) = 2023
group by year, month, VendorID
order by year, month
Unknown integration
DataFrameavailable as
df2
variable
Run cancelled
select
'2023' as year,
datepart('month', tpep_pickup_datetime) as month,
VendorID,
count(*) as total_rides,
avg(passenger_count) as average_passengers
from 'yellow_tripdata_2023-01.parquet'
where datepart('year', tpep_pickup_datetime) = 2023
group by year, month, VendorID
Unknown integration
DataFrameavailable as
df3
variable
select
extract (month from cal.month) as month,
extract( days from ((cal.month + interval '1 month') - cal.month)) as length
from
(select unnest(range(timestamp '2012-01-01', timestamp '2012-12-01', interval '1 month')) as month
) cal
order by month;
df3.dtypes
Unknown integration
DataFrameavailable as
df4
variable
select
regexp_replace ('abc', '(b|c)', 'X') AS aXc, -- aXc
regexp_replace ('abc', '(b|c)', '\\1\\1\\1\\1') AS abbbc, -- abbbc
regexp_replace ('abc', '(.*)c', '\\1e') AS abe, -- abe
regexp_replace ('abc', '(a)(b)', '\\2\\1') AS bac -- bac
import pandas as pd
# create pandas DataFrame with one column with five
# datetime values through a dictionary
df = pd.DataFrame({'DateTime': ['2021-01-15 20:02:11',
'1989-05-24 20:34:11',
'2020-01-18 14:43:24',
'2021-01-15 20:02:10',
'1999-04-04 20:34:11']})
df2 = pd.DataFrame({'DateTime': [20080827,
20080907,
20080910,
20081119,
20081212]})
print("Original data")
print(df)
print(df2)
# convert datetime column to just date
df2['Date'] = pd.to_datetime(df2['DateTime'].astype(str), format='%Y%m%d').dt.date.astype(str)
df2['DateMix'] = df2['DateTime'].apply(lambda x: str(x)[0:4] + '-' + str(x)[4:6] + '-' + str(x)[6:8])
df2['Date3'] = df2['DateTime'].apply(lambda x: f'{str(x)[0:4]}-{str(x)[4:6]}-{str(x)[6:8]}')
#df2['Date'] = pd.to_datetime(str(df2['DateTime']), format='%Y%m%d') #.dt.date
# display
print("Only date")
print(df2)
#df2['Date'] = df2['Date'].astype(str)
df2
import pandas as pd
df = pd.DataFrame({'Content': [
"Filename$",
"Filename%",
"File$%name",
"Filename"]})
print(df)
df.replace(to_replace=['\$', "%"], value="", inplace=True, regex=True)
print(df)
Unknown integration
DataFrameavailable as
df5
variable
create table t as select * from read_parquet('yellow_tripdata_2023-01.parquet')
Unknown integration
DataFrameavailable as
df6
variable
Unknown integration
DataFrameavailable as
movies
variable
create table m as SELECT * FROM 'movies.csv'
Unknown integration
DataFrameavailable as
df8
variable
select