# Scrape the title and paragraph text of every URL listed in Input.xlsx.
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Function to extract title and text from a URL
def extract_title_and_text(url, timeout=30):
    """Fetch a web page and return its title and paragraph text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before ``requests`` gives
            up; without it a stalled server would block the call forever.

    Returns:
        A ``(title, text)`` tuple of strings. ``title`` is the text of the
        page's ``<title>`` element, or ``""`` when the page has none.
        ``text`` is the stripped text of every non-empty ``<p>`` element,
        joined by single spaces, with the first ``tdm-descr`` element
        removed beforehand.

    Raises:
        requests.RequestException: If the request fails or times out.

    Note:
        HTTP error statuses are not checked, so an error page is parsed
        like any other response.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Drop the first <footer>/<p>/<a> carrying the 'tdm-descr' class. Once
    # decomposed it is no longer part of the tree, so the paragraph scan
    # below cannot reach it and needs no extra filtering (the old
    # `p in footer` test crashed with TypeError when nothing was found).
    footer = soup.find(['footer', 'p', 'a'], class_='tdm-descr')
    if footer is not None:
        footer.decompose()

    # `.string` is None for an empty <title> or one with child tags;
    # get_text() always yields a str.
    title = soup.title.get_text() if soup.title else ""

    # Join whole paragraphs with a space. Joining the characters of each
    # paragraph (as before) produced "h e l l o"-style output.
    paragraphs = (p.get_text().strip() for p in soup.find_all('p'))
    text = ' '.join(chunk for chunk in paragraphs if chunk)
    return title, text

# Load the spreadsheet and pick out the column holding the page addresses
df = pd.read_excel('Input.xlsx')
urls = df['URL']

# Scrape each address in order, then split the (title, text) pairs into
# two parallel lists
scraped = [extract_title_and_text(address) for address in urls]
titles = [page_title for page_title, _ in scraped]
texts = [page_text for _, page_text in scraped]

# Tabular view of the scraped results, one row per URL
extracted_data = pd.DataFrame(dict(Title=titles, Text=texts))

# Dump the page texts to disk, one page per line
with open('URL_IDtest.txt', 'w', encoding='utf-8') as out_file:
    out_file.writelines(page_text + '\n' for page_text in texts)

print("Extracted text saved to 'URL_IDtest.txt'")