    # Install beautifulsoup4
    
    !pip install beautifulsoup4
    
    # Import required libraries
    import numpy as np
    import pandas as pd # library for data analysis
    import requests # library to handle requests
    from bs4 import BeautifulSoup # library to parse HTML documents
    
    
    # get the response in the form of html
    wikiurl="https://en.wikipedia.org/wiki/List_of_cities_in_Ukraine#Table_of_cities"
    table_class="wikitable sortable jquery-tablesorter" # unused below; "jquery-tablesorter" is added client-side and is absent from the raw HTML
    response=requests.get(wikiurl)
    print(response.status_code)
    

    Request the HTML response using the URL: we send a GET request to the Wikipedia URL whose table needs to be scraped and store the HTML response in a variable. Not every website permits scraping, so we check the status code; 200 means we can go ahead and download the page.
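    A minimal guard based on that check (a sketch; raise_for_status is the requests shortcut for the same idea):

    if response.status_code != 200:
        raise RuntimeError(f'Request failed with status {response.status_code}')
    # equivalently: response.raise_for_status()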

    do dendrogram

    # Unsupervised Learning in Python: hierarchical clustering with SciPy
    # Given samples (the array of scores) and country_names:
    # import matplotlib.pyplot as plt
    # from scipy.cluster.hierarchy import linkage, dendrogram
    # mergings = linkage(samples, method='complete')
    # dendrogram(mergings,
    #            labels=country_names,
    #            leaf_rotation=90,
    #            leaf_font_size=6)
    # plt.show()
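    A runnable version of that recipe on toy data (a sketch: samples and country_names are placeholders, since neither is defined in this notebook):

    import numpy as np
    import matplotlib.pyplot as plt
    from scipy.cluster.hierarchy import linkage, dendrogram

    # toy stand-ins for the real samples / country_names
    samples = np.array([[1.0, 2.0], [1.1, 2.1], [5.0, 6.0], [5.2, 6.1]])
    country_names = ['A', 'B', 'C', 'D']

    mergings = linkage(samples, method='complete')
    dendrogram(mergings, labels=country_names, leaf_rotation=90, leaf_font_size=6)
    plt.show()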
    # parse data from the html into a beautifulsoup object
    soup = BeautifulSoup(response.text, 'html.parser')
    uatable=soup.find('table',{'class':"wikitable"})
    df=pd.read_html(str(uatable))
    print(type(df))
    print(df[0])
    # pd.read_html returns a list of DataFrames; take the first element
    df=pd.DataFrame(df[0])
    print(df.head())
    print(type(df))
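    One caveat: pandas 2.1+ deprecates passing a literal HTML string to read_html; wrapping the markup in StringIO keeps the same result without the FutureWarning:

    from io import StringIO
    tables = pd.read_html(StringIO(str(uatable)))  # still a list of DataFrames
    df = tables[0]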
    
    # replace_values = {'[a]':'','[b]':'','[c]':'','[d]':'',' ':'_'}
    df['City name']=df['City name'].str.replace(' ','_')
    # note: df['City name'].str.replace("[a]","") may not do what you expect:
    # depending on the pandas version the pattern is treated as a regex
    # (where [a] is a character class matching the letter a) or as a literal;
    # pass regex=True/False explicitly to pick one
    cities=df["City name"].tolist()
    
    
    
    
    
    # print(len(cities[4]))
    # strip footnote markers left over from the wiki table
    for marker in ("[a]", "[b]", "[c]", "[d]"):
        cities=[c.replace(marker, "") for c in cities]
    cities[0:10]
    df.shape
    df.info()
    print(df[df['Oblast']=='Ivano-Frankivsk'])
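    The footnote stripping can also happen on the pandas column itself in one regex pass (a sketch; assumes every marker looks like [a]..[d]):

    # strip bracketed footnote markers like [a], [b] in one pass
    df['City name'] = df['City name'].str.replace(r'\[[a-d]\]', '', regex=True)
    cities = df['City name'].tolist()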

    https://www.projectpro.io/article/sql-database-projects-for-data-analysis-to-practice/565

    from bs4 import BeautifulSoup
    import requests
    wiki_url = 'https://en.wikipedia.org/wiki/List_of_cities_in_Ukraine#List_of_cities'
    print('Fetching main wiki article: %s' % wiki_url)
    page = requests.get(wiki_url).text
    print('Done. Extracting table links..')
    html = BeautifulSoup(page, 'html.parser')
    table = html.find('table', 'wikitable')
    
    
    
    links = table.findAll('a')
    links_content = {}
    list_of_links= []
    # for link in links:
    #   print(link)  
    
    for name in cities:
      # testing:
      # print('https://en.wikipedia.org/wiki/'+link.string)
      # update the dict and the list of article URLs
      links_content.update({name:'https://en.wikipedia.org/wiki/'+name})
      list_of_links.append('https://en.wikipedia.org/wiki/'+name)
    print(list_of_links)
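    City names can contain characters that need percent-encoding in a URL; a defensive variant using requests.utils.quote (a sketch):

    from requests.utils import quote

    links_content = {name: 'https://en.wikipedia.org/wiki/' + quote(name) for name in cities}
    list_of_links = ['https://en.wikipedia.org/wiki/' + quote(name) for name in cities]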
    list_of_links_test = list_of_links[:10]
    city_data = []
    list_dataframes = []
    for link in list_of_links_test:
      #print(link)
      response1=requests.get(link)
      # parse data from the html into a beautifulsoup object
      soup = BeautifulSoup(response1.text, 'html.parser')
      citytable=soup.find('table',{'class':"infobox ib-settlement vcard"})
      # use a new name here so we don't clobber the `cities` list
      city_tables=pd.read_html(str(citytable))
      city_data.append(city_tables[0])
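    Some pages may come back non-200 or lack the infobox, which would make read_html choke on None; a defensive version of the loop (a sketch, with a small delay to be polite):

    import time

    city_data = []
    for link in list_of_links_test:
        resp = requests.get(link)
        if resp.status_code != 200:
            print(f'skipping {link}: status {resp.status_code}')
            continue
        page_soup = BeautifulSoup(resp.text, 'html.parser')
        infobox = page_soup.find('table', {'class': 'infobox ib-settlement vcard'})
        if infobox is None:
            print(f'no infobox found for {link}')
            continue
        city_data.append(pd.read_html(str(infobox))[0])
        time.sleep(0.5)  # be polite to Wikipedia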
    
    # Transpose each DataFrame so infobox rows become columns
    transposed_dfs = []

    for city_df in city_data:
        # Extract the header row (the infobox field names)
        header_row = city_df.iloc[0]
        # Reset the columns of the DataFrame to ensure the same number of elements as the header row
        city_df.columns = range(len(city_df.columns))
        # Remove the header row from the DataFrame
        city_df = city_df.iloc[1:]
        # Transpose the DataFrame
        transposed_df = city_df.transpose()
        # Set the header row as the column names of the transposed DataFrame
        # (note: this assumes the transposed frame has exactly as many columns
        # as header_row has entries, which infobox tables may not satisfy)
        transposed_df.columns = header_row
        # Reset the index
        transposed_df.reset_index(drop=True, inplace=True)
        # Append the transposed DataFrame to the list
        transposed_dfs.append(transposed_df)
    
    
    
    transposed_dfs[1]    
    
    # Next step: combine the transposed city frames into a single combined_df (see the sketch below)
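    A sketch of that combining step with pd.concat (assumes each transposed frame has unique column names; the outer concat keeps every field seen, the second variant keeps only fields present for every city):

    # outer concat: one row per city, NaN where a field is missing
    combined_df = pd.concat(transposed_dfs, ignore_index=True, sort=False)

    # alternative: restrict to the columns shared by all cities
    common_cols = set(transposed_dfs[0].columns)
    for t in transposed_dfs[1:]:
        common_cols &= set(t.columns)
    combined_common = pd.concat([t[list(common_cols)] for t in transposed_dfs], ignore_index=True)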
    
    
      # before appending we need to delete the first 7 rows, standardize and fix the number of columns, then transpose, then stack the dataframes

      # city_data is a list where each element should be a dataframe
    # for i in range(len(city_data)):
    #     city_data[i].columns = city_data[i].iloc[0]
    #     city_data[i] = city_data[i].iloc[1:]
    
    # #Step 1: identify common columns
    # common_columns = set(list_dataframes[0].columns)
    # for df in list_dataframes[1:]:
    #     common_columns &= set(df.columns)
    # common_columns
        
        
        
        
    #convert list(s) to dataframe
    # testdf=pd.concat(city_data, axis=1).iloc[7:]
    # testdf.reset_index(drop=True, inplace=True)
    # testdf
    # transpose=testdf.transpose()
    # transpose.columns=transpose.iloc[0]
    # #remove first row from DataFrame
    # transpose = transpose[1:]
    # transpose
    # print(len(city_data))
    # ukraine=pd.DataFrame(city_data)
    # ukraine.transpose()
    # data1=city_data[0].iloc[7:].transpose()
    # data1.columns=data1.iloc[0]
    # data1=data1[1:]
    # data1.reset_index(inplace=True)
    # data1 = data1.rename(columns = {'index':'City'})
    # data2=city_data[1].transpose()
    
    # #just get the coordinates
    # data1.iloc[:,1]=data1.iloc[:,1].str[-22:]
    
    # #print(data1.iloc[:,1])
    # # data1.columns
    # # testdf=data1
    # # data1.columns
    # new_columns=['City',
    #        'Coordinates',
    #        'Country', 'Municipality', 'Founded', 'Named for', 'City council',
    #        'Raions', 'Government', 'Mayor',
    #        'Area', 'City Area', 'Elevation',
    #        'Population',
    #        'Population_census', 'Rank', 'Density',
    #        'Metro', 'Demonym(s)', 'Gross Regional Product', 'PBI',
    #        'Per capita', 'Time zone', 'Summer (DST)', 'Postal code',
    #        'Area code', 'Vehicle registration plate', 'FIPS code', 'Website']
    # #rename columns manually
    # print(data1.columns)
    # #drop columns
    # #data1.drop(data1.columns[[2,3,5,6,8,10,13,19]],axis=1, inplace=True)
    # data1

    ok so i got the data we wanted from wikipedia, but it is really untidy: each city's infobox has different fields, so the stacked dataframe looks like a diagonal matrix [10000 01000 00100 00010 00001]

    make it tidy

    1. each column is a variable
    2. each row is an observation; each row in this case is a city, that's clear

    so we need to define the variables and fix the number of columns. i think for this we can keep it simple first and then add more features (see the sketch after the schema below)

    cities table: id, name, name_in_latin_characters, city/village, Oblast_region, year_founded, population, area, distance_to_capital (coordinates)
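    a sketch of that schema as an empty DataFrame (these column names are my working reading of the list above):

    tidy_columns = [
        'id', 'name', 'name_in_latin_characters', 'city_or_village',
        'oblast_region', 'year_founded', 'population', 'area',
        'distance_to_capital',
    ]
    tidy_cities = pd.DataFrame(columns=tidy_columns)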

    new_df=pd.DataFrame.from_dict(links_content, orient='index')
    new_df
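    new_df comes back with the city names as the index and a single column named 0; naming both makes later joins easier (a sketch):

    new_df = pd.DataFrame.from_dict(links_content, orient='index', columns=['wiki_url'])
    new_df.index.name = 'City name'
    new_df.head()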