Skip to content
imdb_movie_recommendation

# Movie Recommendation Using NLP

import pandas as pd
import requests
from bs4 import BeautifulSoup
# Request headers carrying a desktop-browser User-Agent string.
# NOTE(review): presumably needed because the site rejects the default
# python-requests User-Agent -- confirm.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
}
# Search URL: the "top_1000" group, 100 results per page, sorted by user
# rating in descending order.
url = "https://www.imdb.com/search/title/?groups=top_1000&count=100&sort=user_rating,desc"
# Fetch the page. requests applies no timeout by default, so without one a
# stalled connection would block the script forever.
response = requests.get(url, headers=header, timeout=30)
# Print the HTTP status so a failed fetch is visible before parsing.
print(response.status_code)
# Parse the raw response body with the built-in HTML parser.
soup = BeautifulSoup(response.content, 'html.parser')
# Parallel lists: position i in each list describes the same movie.
titles = []
years = []
ratings = []
descriptions = []
# Every search result is an <li> element carrying this class.
movie_data = soup.find_all('li', attrs={'class': 'ipc-metadata-list-summary-item'})
# Iterate over each movie item
for data in movie_data:

    # Extract the title. The heading is expected to look like
    # "<rank>. <title>"; split on the FIRST period only so titles that
    # contain periods themselves (e.g. "E.T. ...", "Dr. ...") are kept whole.
    heading = data.find('h3', class_='ipc-title__text').text
    _, separator, remainder = heading.partition('.')
    # Fall back to the whole heading if there is no rank prefix at all.
    title = (remainder if separator else heading).replace("'", "").strip()
    titles.append(title)

    # Extract the year: the first <span> inside the metadata container.
    year = data.find(class_='dli-title-metadata').find_all('span')[0].text.strip()
    years.append(year)

    # Extract the rating: the first whitespace-separated token of the
    # rating element's text, converted to a float.
    rating = float(data.find('span', class_='ipc-rating-star').text.split()[0])
    ratings.append(rating)

    # Extract the description
    description = data.find(class_='ipc-html-content-inner-div').text.strip()
    descriptions.append(description)

import pandas as pd
# Collect the scraped lists into one table, one column per list.
# NOTE(review): 'Snopsis' looks like a typo for 'Synopsis'; it is kept as-is
# because code reading the CSV may refer to the column by this exact name.
scraped_columns = {
    'Title': titles,
    'Year': years,
    'Rating': ratings,
    'Snopsis': descriptions,
}
imdb_top100 = pd.DataFrame(scraped_columns)

# Write the table to disk without the index column, then load it back so
# the rest of the script works from the saved CSV.
csv_path = "imdb_top_100.csv"
imdb_top100.to_csv(csv_path, index=False)
df = pd.read_csv(csv_path)

# Preview the first rows (displays output only in an interactive session).
df.head()

# Preprocessing Using spaCy

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import spacy
from spacy.lang.en.examples import sentences
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity