# Importing all important libraries
import pickle
import re
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from plotly import graph_objs as go
from wordcloud import WordCloud

import gensim
from gensim.parsing.preprocessing import remove_stopwords, STOPWORDS

import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer, PorterStemmer

from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import (confusion_matrix, classification_report, accuracy_score,
                             f1_score, precision_score, recall_score, ConfusionMatrixDisplay)

# NLTK resources required by word_tokenize, pos_tag and WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
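# The NLTK/gensim tools imported above are not exercised in this section; below is a
# minimal sketch of how a comment-cleaning helper could combine them. The raw text is
# assumed to live in a column such as 'Comment' (that column name is an assumption here).
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Lowercase, strip non-letters, drop stopwords, and lemmatize a raw comment."""
    text = re.sub(r"[^a-zA-Z\s]", " ", str(text).lower())    # keep letters only
    text = remove_stopwords(text)                            # gensim stopword removal
    tokens = word_tokenize(text)                             # NLTK tokenization
    return " ".join(lemmatizer.lemmatize(tok) for tok in tokens)

# Example: clean_text("This video is AWESOME!!! 10/10") -> "video awesome"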
# Load the comments and video statistics datasets
comments_df = pd.read_csv("comments.csv")
comments_df.info()

videos_df = pd.read_csv("videos_stats.csv")
videos_df.info()
# Display the first few rows of each dataframe
print("Videos DataFrame:")
print(videos_df.head())

print("\nComments DataFrame:")
print(comments_df.head())
# Check for missing values
print("Missing values in videos dataset:")
print(videos_df.isnull().sum())

print("\nMissing values in comments dataset:")
print(comments_df.isnull().sum())

# Check for duplicates
print("\nDuplicate rows in videos dataset:", videos_df.duplicated().sum())
print("Duplicate rows in comments dataset:", comments_df.duplicated().sum())

# Drop duplicates if any
videos_df.drop_duplicates(inplace=True)
comments_df.drop_duplicates(inplace=True)

# Drop rows with missing values
videos_df.dropna(inplace=True)
comments_df.dropna(inplace=True)

# General statistics
print("Videos DataFrame Statistics:")
print(videos_df.describe())
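
# A quick look at how engagement is distributed can complement describe(); this is a
# small sketch assuming videos_df exposes 'Views' and 'Likes' columns, as the merge
# step further below suggests.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
sns.histplot(videos_df['Views'], bins=30, ax=axes[0])
axes[0].set_title('Distribution of video views')
sns.histplot(videos_df['Likes'], bins=30, ax=axes[1])
axes[1].set_title('Distribution of video likes')
plt.tight_layout()
plt.show()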

# Merge comments with video statistics on the shared Video ID column
merged_df = pd.merge(comments_df, videos_df, on='Video ID', how='left')
merged_df

# Percentage of missing values per column after the merge
merged_df.isna().sum() / len(merged_df) * 100
# Inspect the rows where the left join found no matching video
rows_with_nan = merged_df[merged_df.isna().any(axis=1)]
rows_with_nan
# Keep only rows where the video-side columns were successfully matched
Df_without_nan = merged_df[~merged_df[['Title', 'Published At', 'Keyword', 'Likes_y', 'Comments', 'Views']].isna().any(axis=1)]

# Disambiguate the suffixed like columns: Likes_x comes from comments_df (the left frame)
# and Likes_y from videos_df, so rename them accordingly and keep the result
Df_without_nan = Df_without_nan.rename(columns={'Likes_x': 'Comments_Likes', 'Likes_y': 'Videos_Likes'})
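
# The scikit-learn imports at the top point toward a text-classification step; here is
# a minimal sketch of how the comments could feed a TF-IDF + Naive Bayes pipeline.
# The 'Comment' and 'Sentiment' column names are assumptions about comments.csv,
# not confirmed by this section.
X_train, X_test, y_train, y_test = train_test_split(
    Df_without_nan['Comment'].astype(str),
    Df_without_nan['Sentiment'],
    test_size=0.2,
    random_state=42,
)

text_clf = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', max_features=5000)),
    ('clf', MultinomialNB()),
])

text_clf.fit(X_train, y_train)
y_pred = text_clf.predict(X_test)
print(classification_report(y_test, y_pred))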