Web Scraping with Beautiful Soup for Research Analysis
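The script below reads a list of URLs from Input.xlsx, downloads each page with requests, extracts the page title and paragraph text with Beautiful Soup, and writes the collected text to URL_IDtest.txt.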
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Function to extract title and text from a URL
def extract_title_and_text(url):
    # Send a GET request to the URL
    response = requests.get(url)
    # Parse the HTML content of the page
    soup = BeautifulSoup(response.content, 'html.parser')
    # Remove the site-description block (class 'tdm-descr') so it is not
    # collected as article text; decompose() deletes it from the parse tree
    footer = soup.find(['footer', 'p', 'a'], class_='tdm-descr')
    if footer:
        footer.decompose()
    # Extract the title
    title = soup.title.string if soup.title else ""
    # Collect the text of every remaining paragraph tag
    text = ''
    for p in soup.find_all('p'):
        text += p.get_text(separator=' ', strip=True) + ' '
    return title, text.strip()
# Read the Excel file into a pandas DataFrame
df = pd.read_excel('Input.xlsx')
# Extract URL data from a specific column
urls = df['URL']
# Initialize lists to store titles and texts
titles = []
texts = []
# Extract title and text for each URL
for url in urls:
    title, text = extract_title_and_text(url)
    titles.append(title)
    texts.append(text)
# Create a new DataFrame to store the extracted title and text
extracted_data = pd.DataFrame({'Title': titles, 'Text': texts})
# Save the extracted text to a text file
with open('URL_IDtest.txt', 'w', encoding='utf-8') as file:
    for text in texts:
        file.write(text + '\n')
print("Extracted text saved to 'URL_IDtest.txt'")