import pandas as pd
import numpy as np
import geopandas as gpd
from sklearn.cluster import DBSCAN
import folium
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile
import os
import pandas as pd

# Specify the path to the zip file
zip_path = "./FracFocusCSV.zip"

# Create a list to store the dataframes
dfs = []

# Extract all files from the zip archive
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall()

# Iterate over all files in the extracted directory
for file_name in os.listdir():
    if file_name.endswith(".csv"):
        # Read each CSV file into a dataframe
        df = pd.read_csv(file_name)
        # Append the dataframe to the list
        dfs.append(df)

# Concatenate all dataframes into a single dataframe
combined_df = pd.concat(dfs, ignore_index=True)

# Convert 'JobStartDate' column to datetime
combined_df['JobStartDate'] = pd.to_datetime(combined_df['JobStartDate'], errors='coerce')

# Drop rows whose dates could not be parsed (coerced to NaT above)
combined_df = combined_df[combined_df['JobStartDate'].notnull()]

# Extract the year from 'JobStartDate' column
combined_df['Jobstart_year'] = combined_df['JobStartDate'].dt.year

# Keep only rows with 'Jobstart_year' after 2014
combined_df = combined_df[combined_df['Jobstart_year'] > 2014]

# Drop duplicate rows based on 'APINumber' column
combined_df = combined_df.drop_duplicates('APINumber')

# Print the combined dataframe
print(combined_df)
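# Optional sanity check (a minimal sketch, not part of the original workflow):
# after the filters above, every remaining job should start after 2014 and each
# APINumber should appear only once.
assert combined_df['Jobstart_year'].min() > 2014
assert combined_df['APINumber'].is_unique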
# Create a cluster table using DBSCAN with the haversine distance metric.
import numpy as np
import pandas as pd
import geopandas as gpd
from sklearn.cluster import DBSCAN
from shapely.geometry import Point

data1 = combined_df.copy()
data1.index = combined_df.APINumber
data1.index.name=''
data1 = data1.query("Jobstart_year > 2016")
# Drop rows with duplicate latitude or longitude values (keeps one row per coordinate)
data1 = data1.drop_duplicates(subset=['Latitude'])
data1 = data1.drop_duplicates(subset=['Longitude'])
# Extract latitude and longitude
lat_long = data1[['Latitude', 'Longitude']].dropna()

# Convert the latitude and longitude from degrees to radians
lat_long_radians = np.radians(lat_long[['Latitude', 'Longitude']].values)

# The Earth's radius in feet
EARTH_RADIUS = 20925524.9  # in feet
# Convert the clustering distance threshold (100 feet) to radians
epsilon_radians = 100 / EARTH_RADIUS
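# Quick illustrative check: haversine distances come back in radians, so
# multiplying eps by the Earth's radius in feet should recover roughly the
# intended 100 ft clustering threshold.
print(f"eps corresponds to ~{epsilon_radians * EARTH_RADIUS:.0f} ft")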

# Apply DBSCAN with Haversine metric
dbscan_haversine = DBSCAN(eps=epsilon_radians, min_samples=2, metric='haversine', algorithm='ball_tree')
lat_long['Cluster_Haversine'] = dbscan_haversine.fit_predict(lat_long_radians)

lat_long['Number_of_Wells'] = lat_long.groupby('Cluster_Haversine')['Cluster_Haversine'].transform('size')
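# Illustrative breakdown (assumes the fit above): DBSCAN labels wells with no
# neighbor within the eps threshold as -1 (noise), i.e. single-well locations.
n_single = (lat_long['Cluster_Haversine'] == -1).sum()
n_pads = lat_long.loc[lat_long['Cluster_Haversine'] != -1, 'Cluster_Haversine'].nunique()
print(f"{n_pads} multi-well pads and {n_single} single wells")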
lat_long['API']=lat_long.index

# Convert DataFrame to GeoDataFrame
geometry = [Point(xy) for xy in zip(lat_long['Longitude'], lat_long['Latitude'])]
lat_long['Number_of_Wells']=np.where(lat_long['Number_of_Wells']>1000,1,lat_long['Number_of_Wells']) # noise points (cluster -1) are grouped together, so their size balloons; treat them as single wells
geo_df = gpd.GeoDataFrame(lat_long, geometry=geometry, crs="EPSG:4326")

# Save GeoDataFrame to a shapefile

geo_df.to_file('FracFocus/031023FF_since2019.shp', driver='ESRI Shapefile')
#lat_long.Number_of_Wells.value_counts().sort_index()
#print(lat_long.Number_of_Wells.value_counts()) #shows the count for number of wells per pad
#Finding the Centers of Clusters

# Approximate each pad's center as the mean of its member coordinates (adequate at pad scale)
cluster_centers = lat_long.groupby('Cluster_Haversine')[['Latitude', 'Longitude','Number_of_Wells']].mean().reset_index()
geometry_centers = [Point(xy) for xy in zip(cluster_centers['Longitude'], cluster_centers['Latitude'])]
geo_df_centers = gpd.GeoDataFrame(cluster_centers, geometry=geometry_centers, crs="EPSG:4326")
#geo_df_centers.to_file('FracFocus/centers_output.shp', driver ='ESRI Shapefile') #Creates the shapefile to see clusters as average point
def check_cluster_haversine(row):
    if row['Cluster_Haversine'] == -1:
        return 'Single'
    else:
        return 'Multi'

lat_long['Single_Well'] = lat_long.apply(check_cluster_haversine, axis=1)

def update_cluster_haversine(row):
    if row['Cluster_Haversine'] == -1:
        return str(row['API'])[5:11]
    else:
        return row['Cluster_Haversine']

lat_long['id'] = lat_long.apply(update_cluster_haversine, axis=1)
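# Illustrative example of the id scheme above (hypothetical API number):
# a noise well keeps characters 5:11 of its API string as its id, while
# clustered wells share their DBSCAN cluster label.
print(str(42461123450000)[5:11])  # -> '123450' for a made-up 14-digit API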

lat_long.head()
# Join the original FracFocus data with the DBSCAN results
x = pd.merge(left=lat_long, left_index=True, right=data1, right_on='APINumber', suffixes=('', '_right'))
x = x[['Latitude','Longitude','Cluster_Haversine','Number_of_Wells','JobStartDate','JobEndDate','StateNumber','CountyNumber','StateName','OperatorName','id','Single_Well','API']]

# Bin Number_of_Wells; the top bin only exists because DBSCAN's noise label inflated
# some counts above 1000, and those were already forced back to 1 above, so it never
# needs to be graphed
x['Number_of_Wells_Bins'] = pd.cut(x['Number_of_Wells'], bins=[0,1,2,3,4,5,6,10,16,30, float('inf')], labels=['1','2', '3', '4','5','6','10','16','30','100'])
x['Number_of_Wells_Bins'] = x['Number_of_Wells_Bins'].fillna('1')
x['JobEndDate'] = pd.to_datetime(x['JobEndDate'])
x['wDays'] = (x['JobEndDate'] - x['JobStartDate']).dt.days
x['StateNumber']=x['StateNumber'].astype('int')
x['CountyNumber']=x['CountyNumber'].astype('int')
# Standard FIPS codes are the 2-digit state code followed by the 3-digit county code
x['FIPS'] = x['StateNumber'].astype('str').str.zfill(2) + x['CountyNumber'].astype('str').str.zfill(3)
x['FIPS']=x['FIPS'].astype('int')
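# Illustrative only (hypothetical values): state 48 with county 329 combines
# into the 5-digit FIPS code 48329 before the integer cast above.
print(int('48'.zfill(2) + '329'.zfill(3)))  # -> 48329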

x.drop(x[x['Longitude'] < -180].index, inplace=True)
x.drop(x[x['Longitude'] > 180].index, inplace=True)

x.drop(x[x['Latitude'] < -90].index, inplace=True)
x.drop(x[x['Latitude'] > 90].index, inplace=True)

import MHB as mhb
x['MHB'] = x.apply(mhb.gridCalc, axis=1)

# Merge DPR Regions
DPR = pd.read_excel('DPR_FIPS_forFF.xlsx', sheet_name='RegionCounties', header=0)
x=pd.merge(left=x, left_on='FIPS', right=DPR[['FIPS','Region','dprCode','County']], right_on='FIPS', how ='outer')
x['Region'] = x['Region'].fillna("Non-DPR")
x['dprCode'] = x['dprCode'].fillna(0)

# Grouping by Cluster_Haversine and finding the earliest JobStartDate and latest JobEndDate
Haversine_pad_min_max = x.groupby(['Cluster_Haversine']).agg({'JobStartDate': 'min', 'JobEndDate': 'max'})
# Calculating the difference between JobEndDate and JobStartDate
Haversine_pad_min_max['Number_days_per_pad'] = (Haversine_pad_min_max['JobEndDate'] - Haversine_pad_min_max['JobStartDate']).dt.days
Haversine_pad_min_max=Haversine_pad_min_max.reset_index()
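# Illustrative only (hypothetical dates): a pad whose first job started
# 2021-01-05 and whose last job ended 2021-02-14 gets Number_days_per_pad = 40.
print((pd.Timestamp('2021-02-14') - pd.Timestamp('2021-01-05')).days)  # -> 40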
#Merge the number of days to complete pad to overall dataframe
x = pd.merge(left = Haversine_pad_min_max[['Number_days_per_pad','Cluster_Haversine']], left_on='Cluster_Haversine', right =x, right_on='Cluster_Haversine', how='outer')
x.dropna(inplace=True) #Anadarko counties with no completions
x.info() #confirms MHB works
#Grouping Pads by Operator Name within MHB and ranking the Pads in order by JobStartDate 

x['Date_Rank']=x.groupby(['MHB','OperatorName','id'])['JobStartDate'].rank(method='first', ascending=True)
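# Illustrative only: rank(method='first') hands out 1, 2, 3, ... in order of
# appearance, so within each MHB/operator/pad group the earliest JobStartDate
# gets rank 1.
print(pd.Series(pd.to_datetime(['2020-03-01', '2020-01-15', '2020-02-01'])).rank(method='first'))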
pd.DataFrame(x.loc[x.OperatorName=='Upland Exploration LLC'])
x['id'] = x['id'].astype('int')
y = x.copy()
# Pad_Date_Rank: order of each row within its (id, OperatorName, Region) group,
# ranked by JobStartDate (earliest job gets rank 1)
y['Pad_Date_Rank'] = x.sort_values('JobStartDate').groupby(['id','OperatorName','Region'])['id'].rank(method='first', ascending=True)

y = y.sort_values('id', ascending=False)
y.to_csv('a.csv')
y['Well_Date_Rank'] = x.sort_values('JobStartDate').groupby(['OperatorName','MHB'])['id'].rank(method='first', ascending=True)

max_date_rank_per_region = y.groupby('Region')[['Well_Date_Rank','Pad_Date_Rank']].max()
max_date_rank_per_region['Pad_Date_Rank'] = max_date_rank_per_region['Pad_Date_Rank'].astype(int)
max_date_rank_per_region

# Remove all spaces within the OperatorName column and trim any leading or trailing whitespace
y['OperatorName_NoSpaces'] = y['OperatorName'].str.replace(' ', '').str.strip()

# Ensure the relevant columns are in the right format
y['dprCode'] = y['dprCode'].astype(int).astype(str).str.zfill(2)
y['CountyNumber'] = y['CountyNumber'].astype(int).astype(str).str.zfill(3)
y['MHB_last_2'] = y['MHB'].str[-2:]
y['Pad_Date_Rank'] = y['Pad_Date_Rank'].astype(int).astype(str).str.zfill(3)

# Generate the ID with dashes as delimiters based on the updated format

y['PAD_Extended_ID'] = (y['dprCode'] + '-' + y['CountyNumber'] + '-' + y['MHB_last_2'] + '-' + y['OperatorName_NoSpaces'] + '-' + y['Single_Well'] + '-' + y['Pad_Date_Rank'])
pd.DataFrame(y.loc[y.Single_Well=='Single']['PAD_Extended_ID'])  # Single_Well holds 'Single'/'Multi'
# 1/2/24: idea to encode num_wells in the ID rather than Single vs Multi
y['num_wells'] = y['Number_of_Wells_Bins'].astype(str)
y['PAD_Extended_ID'] = (y['dprCode'] + '-' + y['CountyNumber'] + '-' + y['MHB_last_2'] + '-' + y['OperatorName_NoSpaces'] + '-' + y['num_wells'] + '-' + y['Pad_Date_Rank'])
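# Illustrative check (format only, actual values depend on the data): each ID
# reads dprCode-CountyNumber-last two MHB characters-OperatorName-wells bin-pad
# order, e.g. a hypothetical '42-329-07-OPERATORNAME-4-001'.
print(y['PAD_Extended_ID'].head())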
m=y.query("num_wells!='1'")
m[["Date_Rank","Pad_Date_Rank","PAD_Extended_ID"]]
y.to_csv('FF_data.csv')
pivot_table = y.pivot_table(index='OperatorName_NoSpaces', columns='Region', values='PAD_Extended_ID', aggfunc='count')
pivot_table.to_csv('companyby.csv')
pivot_table
# Filter y for dprCode = 42
y_filtered = y[y['dprCode'] == '42']

# Pivot table for y_filtered with JobStartDate by period quarter index and columns for Single_Well counts
pivot_table = pd.pivot_table(y_filtered, values='API', index=y_filtered['JobStartDate'].dt.to_period('Q'), columns='Single_Well', aggfunc='count')
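# Illustrative only: dt.to_period('Q') buckets each JobStartDate into its
# calendar quarter, e.g. a hypothetical 2021-05-17 falls in 2021Q2.
print(pd.Timestamp('2021-05-17').to_period('Q'))  # -> 2021Q2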

# Reorder the columns to have Single on the bottom and Multi on top
pivot_table = pivot_table[['Single', 'Multi']]

# Stacked bar chart
ax = pivot_table.plot(kind='bar', stacked=True)
plt.title('Single well versus Multi-Well Completions - Permian Region')
plt.ylabel('Count of Well Sites')
plt.legend(bbox_to_anchor=(1, 1), title='')
plt.show()