# Skip to content
# imdb_movie_recommendation
# Movie Recommendation Using NLP
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Browser-like User-Agent sent with the request instead of the default
# python-requests one (presumably so the site serves the regular HTML page —
# TODO confirm this is still required).
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
}

# IMDb title-search URL; the query string asks for the "top_1000" group,
# 100 results, sorted by user rating in descending order.
url = "https://www.imdb.com/search/title/?groups=top_1000&count=100&sort=user_rating,desc"

# Response
# requests applies no timeout by default, so an unresponsive server would
# block the script forever; bound the wait explicitly.
response = requests.get(url, headers=header, timeout=30)
print(response.status_code)
# Fail loudly on a 4xx/5xx reply rather than parsing an error page and
# silently ending up with an empty dataset.
response.raise_for_status()

# Parse the raw response bytes with the standard-library HTML parser backend.
soup = BeautifulSoup(response.content, "html.parser")
# Parallel column lists: position i in each list belongs to the same movie.
titles = []
years = []
ratings = []
descriptions = []

# Collect every <li class="ipc-metadata-list-summary-item"> element; the loop
# below treats each one as a single movie entry.
# find_all is the current API name; findAll is a deprecated alias.
movie_data = soup.find_all('li', attrs={'class': 'ipc-metadata-list-summary-item'})

# Iterate over each movie item
for data in movie_data:
    # Extract the title: keep the text after the FIRST period only (the part
    # before it appears to be the list rank), so titles that themselves
    # contain periods are not truncated. Apostrophes are dropped.
    title = data.find('h3', class_='ipc-title__text').text.split('.', 1)[1].replace("'", "").strip()
    titles.append(title)

    # Extract the year: text of the first <span> inside the metadata element.
    year = data.find(class_='dli-title-metadata').find_all('span')[0].text.strip()
    years.append(year)

    # Extract the rating: first whitespace-separated token of the rating
    # element's text, converted to a float.
    rating = data.find('span', class_='ipc-rating-star').text.split()[0].strip()
    rating = float(rating)
    ratings.append(rating)

    # Extract the description
    description = data.find(class_='ipc-html-content-inner-div').text.strip()
    descriptions.append(description)
import pandas as pd
# Assemble the scraped columns into one table; the keyword names become the
# column headers, in the order given.
# NOTE(review): "Snopsis" looks like a typo for "Synopsis"; kept as-is because
# it is the column name written to the CSV and may be referenced elsewhere.
imdb_top100 = pd.DataFrame(
    dict(Title=titles, Year=years, Rating=ratings, Snopsis=descriptions)
)

# Write the table to disk without the row index, then load it back from the
# same file.
csv_path = "imdb_top_100.csv"
imdb_top100.to_csv(csv_path, index=False)
df = pd.read_csv(csv_path)

# Preview the first rows (a bare expression: only shown in an interactive
# session or notebook cell).
df.head()
# Preprocessing Using spaCy
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import spacy
from spacy.lang.en.examples import sentences
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity