Introduction¶
This Jupyter Notebook aims to conduct a comprehensive statistical analysis of the Netflix Dataset. The focus will be on exploring content trends, distribution, and characteristics such as genres, languages, and IMDb ratings over time. This will help us understand how different factors might influence the popularity and ratings of shows and movies on Netflix.
Data Loading and Preparation¶
We begin by loading the necessary libraries and the dataset. Then, we will convert the 'Premiere' column to a datetime format for easier analysis, and inspect the data to understand its structure and content.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from datetime import datetime
# Load the data
df = pd.read_csv('../data/netflix/netflix.csv')
# Convert 'Premiere' to datetime format
df['premiere'] = pd.to_datetime(df['premiere'])
# Check initial data
df.head()
| | title | genre | language | imdb_score | premiere | runtime | year |
|---|---|---|---|---|---|---|---|
| 0 | Notes for My Son | Drama | Spanish | 6.3 | 2020-11-24 | 83 | 2020 |
| 1 | To Each, Her Own | Romantic comedy | French | 5.3 | 2018-06-24 | 95 | 2018 |
| 2 | The Lovebirds | Romantic comedy | English | 6.1 | 2020-05-22 | 87 | 2020 |
| 3 | The Perfection | Horror-thriller | English | 6.1 | 2019-05-24 | 90 | 2019 |
| 4 | Happy Anniversary | Romantic comedy | English | 5.8 | 2018-03-30 | 78 | 2018 |
Data Cleaning¶
In this section, we will ensure the quality of our dataset by checking for and handling missing values and duplicates. This is crucial for maintaining accuracy in our analysis.
# Check for missing values
print(df.isnull().sum())
# Handling missing values (if any)
df.dropna(inplace=True) # or other methods depending on the context
# Removing duplicates
df.drop_duplicates(inplace=True)
# Confirm changes
df.info()
title         0
genre         0
language      0
imdb_score    0
premiere      0
runtime       0
year          0
dtype: int64

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 583 entries, 0 to 582
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   title       583 non-null    object
 1   genre       583 non-null    object
 2   language    583 non-null    object
 3   imdb_score  583 non-null    float64
 4   premiere    583 non-null    datetime64[ns]
 5   runtime     583 non-null    int64
 6   year        583 non-null    int64
dtypes: datetime64[ns](1), float64(1), int64(2), object(3)
memory usage: 32.0+ KB
Exploratory Data Analysis (EDA)¶
We will now dive into the dataset to uncover patterns, detect outliers, and get a sense of the data distributions. This will involve both numeric and categorical analyses.
A. Overview of Numeric and Categorical Data¶
We start by looking at the basic statistics of numeric features and the distribution of categorical features such as genres and languages.
# Descriptive statistics for numeric columns
print(df.describe())
# Frequency of categories in 'Genre' and 'Language'
print(df['genre'].value_counts())
print(df['language'].value_counts())
# Distribution of IMDb Scores
sns.histplot(df['imdb_score'], kde=True)
plt.title('Distribution of IMDb Scores')
plt.xlabel('IMDb Score')
plt.ylabel('Frequency')
plt.show()
       imdb_score                       premiere     runtime         year
count  583.000000                            583  583.000000   583.000000
mean     6.275129  2019-06-19 17:02:34.373927936   93.490566  2018.934820
min      2.500000            2014-12-13 00:00:00    4.000000  2014.000000
25%      5.700000            2018-06-26 12:00:00   86.000000  2018.000000
50%      6.400000            2019-10-16 00:00:00   97.000000  2019.000000
75%      7.000000            2020-09-19 12:00:00  107.500000  2020.000000
max      9.000000            2021-05-27 00:00:00  209.000000  2021.000000
std      0.976678                            NaN   27.706665     1.474598

genre
Documentary                    159
Drama                           77
Comedy                          49
Romantic comedy                 39
Thriller                        33
                              ...
Political thriller               1
Fantasy                          1
Romantic comedy-drama            1
Animation/Musical/Adventure      1
Supernatural drama               1
Name: count, Length: 114, dtype: int64

language
English                       401
Hindi                          32
Spanish                        31
French                         20
Italian                        14
Portuguese                     12
Indonesian                      9
Korean                          6
Japanese                        6
English/Spanish                 5
German                          5
Turkish                         5
Polish                          3
Dutch                           3
Marathi                         3
Filipino                        2
Thai                            2
English/Japanese                2
English/Hindi                   2
English/Mandarin                2
English/Korean                  1
Khmer/English/French            1
English/Akan                    1
Bengali                         1
English/Swedish                 1
English/Arabic                  1
English/Taiwanese/Mandarin      1
Norwegian                       1
Tamil                           1
English/Ukranian/Russian        1
Spanish/Catalan                 1
English/Russian                 1
Georgian                        1
Spanish/English                 1
Swedish                         1
Malay                           1
Thia/English                    1
Spanish/Basque                  1
Name: count, dtype: int64
Correlation Analysis¶
To understand relationships between numeric features, we will compute the correlation matrix. This can highlight potential associations between variables such as IMDb score and runtime.
# Correlation matrix
corr_matrix = df[['imdb_score', 'runtime', 'year']].corr()
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()
Trend Analysis¶
Examining trends over time can provide insights into how content characteristics have evolved. We will look at trends in IMDb score and runtime across years.
# Trends in IMDb Score over years
plt.figure(figsize=(12, 6))
sns.lineplot(data=df, x='year', y='imdb_score', marker='o')
plt.title('Trend of IMDb Scores Over Years')
plt.xlabel('Year')
plt.ylabel('Average IMDb Score')
plt.grid(True)
plt.show()
# Trends in Runtime over years
plt.figure(figsize=(12, 6))
sns.lineplot(data=df, x='year', y='runtime', marker='o', color='red')
plt.title('Trend of Runtime Over Years')
plt.xlabel('Year')
plt.ylabel('Average Runtime (minutes)')
plt.grid(True)
plt.show()
Data Integration¶
To enhance our dataset with additional information, we will integrate a second Netflix dataset that includes a 'type' column (categorizing content as 'Movie' or 'TV Show') and a 'rating' column (content maturity rating). This allows us to conduct more targeted analyses, such as comparing IMDb scores between movies and TV shows or analyzing content distribution across different ratings.
Challenges¶
One key challenge in integrating datasets is ensuring that the join key (in this case, the title of the show or movie) matches exactly between datasets. Any discrepancy in titles (such as spelling errors, or extra information like a year appearing in one title but not the other) can lead to mismatches or missing data. We will perform a left join so that all titles in our original dataset are retained, adding 'type' and 'rating' information where available.
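Before merging, a light normalization of the join key can remove avoidable mismatches caused by stray whitespace or inconsistent casing. The sketch below is illustrative only: the title_key column and the commented merge are hypothetical and are not applied in this notebook, which joins directly on title.
import pandas as pd
# Hedged sketch: normalize titles before joining to reduce trivial mismatches.
# Illustrative only; 'df' is not modified in this notebook.
def normalize_title(s: pd.Series) -> pd.Series:
    return s.str.strip().str.lower()
# Hypothetical usage: build a normalized join key on each frame, then merge on it.
# df['title_key'] = normalize_title(df['title'])
# new_df['title_key'] = normalize_title(new_df['title'])
# merged = pd.merge(df, new_df[['title_key', 'type', 'rating']], on='title_key', how='left')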
# Load the second dataset into 'new_df'; it includes 'title', 'type', and 'rating' columns
new_df = pd.read_csv('../data/netflix/netflix_titles.csv')
# Merging the original dataset 'df' with 'new_df'
# We are using a left join to keep all entries from the original dataset and only add matching entries from 'new_df'
merged_df = pd.merge(df, new_df[['title', 'type', 'rating']], on='title', how='left')
# Check the first few rows and info to confirm the merge
print(merged_df.head())
merged_df.info()
               title            genre language  imdb_score   premiere  \
0   Notes for My Son            Drama  Spanish         6.3 2020-11-24
1   To Each, Her Own  Romantic comedy   French         5.3 2018-06-24
2      The Lovebirds  Romantic comedy  English         6.1 2020-05-22
3     The Perfection  Horror-thriller  English         6.1 2019-05-24
4  Happy Anniversary  Romantic comedy  English         5.8 2018-03-30

   runtime  year   type rating
0       83  2020  Movie  TV-MA
1       95  2018  Movie  TV-MA
2       87  2020  Movie      R
3       90  2019  Movie  TV-MA
4       78  2018  Movie  TV-MA

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 583 entries, 0 to 582
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   title       583 non-null    object
 1   genre       583 non-null    object
 2   language    583 non-null    object
 3   imdb_score  583 non-null    float64
 4   premiere    583 non-null    datetime64[ns]
 5   runtime     583 non-null    int64
 6   year        583 non-null    int64
 7   type        505 non-null    object
 8   rating      505 non-null    object
dtypes: datetime64[ns](1), float64(1), int64(2), object(5)
memory usage: 41.1+ KB
Handling Missing Data and Ensuring Data Sufficiency¶
Before conducting any statistical tests, it's essential to address missing data and verify that we have enough data points for a reliable analysis. This section will outline the steps taken to clean the data and ensure the integrity of our results.
Addressing NaN Values¶
NaN (Not a Number) values can significantly impact the outcome of statistical tests by distorting the actual distribution of data. We need to carefully handle these by either removing them or imputing them, depending on the nature and volume of the missing data.
Checking Data Sufficiency¶
The reliability of statistical tests also hinges on having a sufficient number of observations in each group being compared. This is crucial to avoid errors in statistical inference.
Steps to Address Missing Data and Check Data Sufficiency¶
- Identify NaN Values: We first identify and count NaN values in columns critical to our analysis, such as imdb_score.
- Remove or Impute NaN Values: Based on the amount and nature of the missing data, we either remove these entries or impute them with an appropriate statistic (median, mean, or mode); a hedged imputation sketch appears after this list.
- Verify Sufficient Data Points: After cleaning, we confirm that each category (movies and TV shows) retains enough entries to conduct meaningful statistical tests.
By meticulously preparing our data, we lay a strong foundation for accurate and reliable statistical analysis.
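For completeness, here is a hedged sketch of the imputation option mentioned in step 2. It is not used in this notebook (rows are dropped instead), and imdb_score turns out to have no missing values, so the lines below are shown for illustration only.
# Hedged sketch: median imputation as an alternative to dropping rows.
median_score = merged_df['imdb_score'].median()
imputed_scores = merged_df['imdb_score'].fillna(median_score)
print(imputed_scores.isnull().sum())  # 0 remaining NaNs after imputation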
# Identifying NaN values
print(merged_df.isnull().sum())
# Removing NaN values from 'IMDb Score'
merged_df.dropna(subset=['imdb_score'], inplace=True)
# Verifying the removal of NaN values
print(merged_df.isnull().sum())
# Checking the number of data points for each group
movies_count = merged_df[merged_df['type'] == 'Movie'].shape[0]
tv_shows_count = merged_df[merged_df['type'] == 'TV Show'].shape[0]
print(f"Number of movies with scores: {movies_count}")
print(f"Number of TV shows with scores: {tv_shows_count}")
# Ensure both groups have sufficient data points
if movies_count > 30 and tv_shows_count > 30:
print("Both groups have sufficient data points for analysis.")
else:
print("One or both groups do not have sufficient data points for reliable statistical analysis.")
title          0
genre          0
language       0
imdb_score     0
premiere       0
runtime        0
year           0
type          78
rating        78
dtype: int64

title          0
genre          0
language       0
imdb_score     0
premiere       0
runtime        0
year           0
type          78
rating        78
dtype: int64

Number of movies with scores: 505
Number of TV shows with scores: 0
One or both groups do not have sufficient data points for reliable statistical analysis.
Revising the Analysis Objective¶
During the initial stages of our analysis, we aimed to compare IMDb scores between movies and TV shows. However, upon closer inspection and data cleaning, we discovered that our dataset exclusively contains movies. This realization necessitates a shift in our analysis focus.
New Analysis Direction¶
Given that the dataset contains only movies, we can explore different aspects of this data. One potential avenue is to investigate how movies of different ratings (e.g., PG, PG-13, R) compare in terms of their IMDb scores. This will provide insights into whether the content rating affects the perceived quality of the movies.
New Hypothesis¶
- Null Hypothesis (H0): There is no significant difference in IMDb scores across different movie ratings.
- Alternative Hypothesis (H1): There is a significant difference in IMDb scores across different movie ratings.
We will perform an ANOVA test to evaluate this hypothesis, as it is suitable for comparing the means of three or more groups.
# Note: value_counts() ignores NaN ratings, and the isin() filter below excludes
# unrated titles from the ANOVA, so no rows need to be dropped here
# Check the number of movies in each rating category
rating_counts = merged_df['rating'].value_counts()
print(rating_counts)
# Filtering data for the most common ratings to ensure sufficient sample size
common_ratings = rating_counts[rating_counts > 30].index # Filter ratings with more than 30 movies
filtered_data = merged_df[merged_df['rating'].isin(common_ratings)]
# Performing ANOVA
anova_result = stats.f_oneway(
*[filtered_data[filtered_data['rating'] == rating]['imdb_score'] for rating in common_ratings]
)
print(f"F-statistic: {anova_result.statistic:.2f}")
print(f"P-value: {anova_result.pvalue:.4f}")
# Interpretation
if anova_result.pvalue < 0.05:
print("We reject the null hypothesis: There is a significant difference in IMDb Scores across movie ratings.")
else:
print("We fail to reject the null hypothesis: There is no significant difference in IMDb Scores across movie ratings.")
rating
TV-MA    253
TV-14     91
TV-PG     55
R         47
PG-13     23
TV-G      15
PG        11
TV-Y       5
TV-Y7      5
Name: count, dtype: int64

F-statistic: 2.42
P-value: 0.0654
We fail to reject the null hypothesis: There is no significant difference in IMDb Scores across movie ratings.
Analysis Results¶
The results of the ANOVA test aimed at exploring the differences in IMDb scores across different movie ratings are now in. The F-statistic and the P-value from this test help us understand whether movie ratings significantly affect IMDb scores.
Findings¶
- F-statistic: 2.42
- P-value: 0.0654
Interpretation¶
Based on the p-value obtained from the ANOVA test, we can draw the following conclusions about our hypothesis:
"If the p-value is less than 0.05, it indicates statistical significance; however, in this case, the p-value is 0.0654: "We fail to reject the null hypothesis: There is no significant difference in IMDb Scores across movie ratings. This implies that the content rating does not significantly influence how movies are rated on IMDb. Therefore, viewers' perception of quality does not appear to be strongly associated with the official content ratings."
This outcome suggests that other factors beyond the content rating may play a more substantial role in influencing the IMDb scores of movies. Further investigation could focus on variables such as genre, director, or cast to explore other potential influences on viewer ratings.
Understanding the P-Value of 0.0654¶
The p-value in a statistical test helps us decide whether to reject the null hypothesis. Typically, a threshold (alpha level) is set at 0.05, meaning:
- Less than 0.05: There is strong evidence against the null hypothesis, so we reject it.
- Greater than or equal to 0.05: We do not have enough evidence to reject the null hypothesis.
What Does a P-Value of 0.0654 Indicate?¶
A p-value of 0.0654 is slightly above the common alpha cutoff of 0.05. This result is often considered to be "marginally significant" or indicating a "trend towards significance." Here's what it implies:
- Statistical Significance: The p-value does not meet the conventional threshold for statistical significance, suggesting that we cannot conclusively reject the null hypothesis based on the 0.05 criterion.
- Practical Significance: Despite not achieving statistical significance, a p-value close to the threshold like 0.0654 might still hold practical significance, especially in social sciences and applied fields. It suggests a possible effect that could be explored further with a larger sample size or additional variables.
- Effect Size: It's also crucial to consider the effect size, which describes the magnitude of the difference between groups. Even if the p-value is not below 0.05, a large effect size can indicate that the differences, while not statistically significant, are meaningful in practical terms.
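As a hedged sketch of the effect-size point above, eta-squared for the one-way ANOVA can be computed as the between-group sum of squares divided by the total sum of squares, reusing filtered_data from the ANOVA cell:
# Sketch: eta-squared (SS_between / SS_total) for the one-way ANOVA above
grand_mean = filtered_data['imdb_score'].mean()
ss_total = ((filtered_data['imdb_score'] - grand_mean) ** 2).sum()
group_stats = filtered_data.groupby('rating')['imdb_score'].agg(['mean', 'count'])
ss_between = (group_stats['count'] * (group_stats['mean'] - grand_mean) ** 2).sum()
eta_squared = ss_between / ss_total
print(f"Eta-squared: {eta_squared:.3f}")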
Further Steps¶
- Re-evaluate the Sample Size: Increasing the sample size can help to decrease the p-value if the observed effect remains constant. More data provides a better estimate of the true effect size and can help push the p-value below the significance threshold.
- Review Assumptions: Check if all assumptions for the ANOVA were met, including homogeneity of variance and normality. Violations of these assumptions can affect the p-value (a hedged check is sketched after this list).
- Consider Adjusting Alpha: In some research contexts, adjusting the alpha level to a value slightly higher than 0.05 (e.g., 0.10) might be justified, especially in exploratory analyses or pilot studies where the aim is to identify potential patterns rather than confirm definitive effects.
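The hedged sketch referenced in the assumptions point above runs Levene's test for homogeneity of variance and a per-group Shapiro-Wilk normality check, reusing filtered_data and common_ratings from the ANOVA cell:
# Sketch: assumption checks for the one-way ANOVA above
groups = [filtered_data.loc[filtered_data['rating'] == r, 'imdb_score'] for r in common_ratings]
# Homogeneity of variance across rating groups (Levene's test)
levene_stat, levene_p = stats.levene(*groups)
print(f"Levene's test: statistic={levene_stat:.3f}, p-value={levene_p:.4f}")
# Approximate normality within each group (Shapiro-Wilk)
for rating, scores in zip(common_ratings, groups):
    shapiro_stat, shapiro_p = stats.shapiro(scores)
    print(f"Shapiro-Wilk for {rating}: statistic={shapiro_stat:.3f}, p-value={shapiro_p:.4f}")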
Conclusion¶
In conclusion, a p-value of 0.0654 does not provide enough evidence to statistically reject the null hypothesis at the conventional 0.05 level. However, it suggests a potential effect that merits further investigation, possibly with a revised approach or additional data. This kind of nuanced interpretation helps in understanding the limits and potential of your statistical analysis.
Distribution Analysis¶
Analyzing the distribution of IMDb scores and other related variables helps in understanding the data's underlying patterns, identifying outliers, and observing the spread and central tendency of the data. We will focus on the distribution of IMDb scores and explore how these scores vary across different movie ratings.
Objectives¶
- Explore the distribution of IMDb scores: We aim to understand the spread, skewness, and kurtosis of IMDb scores (a short numeric check follows this list).
- Examine variations across ratings: Analyze how the distribution of scores differs among various movie ratings to identify any patterns or anomalies.
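The quick numeric check referenced above, as a hedged complement to the plots that follow:
# Sketch: numeric skewness and excess kurtosis of the IMDb scores
print(f"Skewness: {merged_df['imdb_score'].skew():.3f}")
print(f"Excess kurtosis: {merged_df['imdb_score'].kurtosis():.3f}")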
# Overall Distribution of IMDb Scores
plt.figure(figsize=(10, 5))
sns.histplot(merged_df['imdb_score'], kde=True, color='blue')
plt.title('Distribution of IMDb Scores')
plt.xlabel('IMDb Score')
plt.ylabel('Frequency')
plt.show()
# Boxplot to Show Distribution Across Ratings
plt.figure(figsize=(12, 6))
sns.boxplot(x='rating', y='imdb_score', data=merged_df)
plt.title('IMDb Scores by Movie Rating')
plt.xlabel('Movie Rating')
plt.ylabel('IMDb Score')
plt.show()
# Detailed Distribution for Each Rating
ratings = merged_df['rating'].unique()
for rating in ratings:
plt.figure(figsize=(10, 5))
sns.histplot(merged_df[merged_df['rating'] == rating]['imdb_score'], kde=True)
plt.title(f'Distribution of IMDb Scores for {rating} Rated Movies')
plt.xlabel('IMDb Score')
plt.ylabel('Frequency')
plt.show()
Interpretation¶
- Histogram and KDE Plot: The overall histogram and KDE plot of IMDb scores provide a visual representation of the data's distribution, highlighting any skewness or potential outliers. This is crucial for assessing the data's normality.
- Boxplot Analysis: The boxplots for different ratings allow us to compare the median and range of IMDb scores across categories, helping to identify ratings with higher or lower variability.
- Rating-Specific Distributions: By examining the distribution of scores within each rating category, we can detect any peculiarities like bimodal distributions or unusually wide spreads, which might suggest varying content quality within the same rating category.
This distribution analysis is essential for informing subsequent analyses, such as regression modeling or cluster analysis, by ensuring that we understand the data's fundamental characteristics.
Regression Analysis¶
Regression analysis will allow us to understand the relationship between IMDb scores and other variables such as runtime and movie ratings. We will use linear regression to predict IMDb scores based on these features, identifying significant predictors of movie success.
Objectives¶
- Build a Linear Regression Model: To predict IMDb scores using runtime and movie ratings as predictors.
- Evaluate the Model: Assess the model's performance through R-squared and RMSE (Root Mean Square Error) metrics.
import statsmodels.api as sm
# Preparing data for regression analysis
# Create dummy variables for the categorical 'rating' column
# (titles without a matched rating have all dummy columns equal to zero and act as the implicit baseline)
rating_dummies = pd.get_dummies(merged_df['rating'])
# Other categorical variables could be encoded the same way;
# 'runtime' is the only additional (numeric) predictor here
X = pd.concat([merged_df[['runtime']], rating_dummies], axis=1)
variable_names = ['const'] + list(X.columns)
X = np.array(X.astype(int))
# Adding a constant for the intercept term
X = sm.add_constant(X)
# Response variable
Y = merged_df[['imdb_score']]
Y = np.array(Y.astype(float))
# Building the model
model = sm.OLS(Y, X).fit()
print(model.summary(xname=variable_names))
                            OLS Regression Results
==============================================================================
Dep. Variable:                      y   R-squared:                       0.060
Model:                            OLS   Adj. R-squared:                  0.043
Method:                 Least Squares   F-statistic:                     3.631
Date:                Sun, 21 Apr 2024   Prob (F-statistic):           0.000105
Time:                        12:12:39   Log-Likelihood:                -795.04
No. Observations:                 583   AIC:                             1612.
Df Residuals:                     572   BIC:                             1660.
Df Model:                          10
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          6.7706      0.170     39.835      0.000       6.437       7.104
runtime       -0.0003      0.002     -0.215      0.830      -0.003       0.003
PG             0.0600      0.308      0.195      0.846      -0.545       0.665
PG-13         -0.3489      0.229     -1.526      0.128      -0.798       0.100
R             -0.2344      0.185     -1.265      0.206      -0.598       0.130
TV-14         -0.6032      0.149     -4.061      0.000      -0.895      -0.312
TV-G          -0.5237      0.269     -1.944      0.052      -1.053       0.005
TV-MA         -0.6323      0.126     -5.015      0.000      -0.880      -0.385
TV-PG         -0.4857      0.169     -2.875      0.004      -0.817      -0.154
TV-Y           0.0257      0.444      0.058      0.954      -0.847       0.898
TV-Y7         -0.5041      0.441     -1.144      0.253      -1.370       0.362
==============================================================================
Omnibus:                       29.696   Durbin-Watson:                   2.060
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               34.248
Skew:                          -0.514   Prob(JB):                     3.66e-08
Kurtosis:                       3.594   Cond. No.                     1.18e+03
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.18e+03. This might indicate that there are strong multicollinearity or other numerical problems.
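The objectives above list RMSE alongside R-squared, but the summary table reports only the latter. A minimal, hedged follow-up computes the in-sample RMSE from the fitted model's residuals:
# Sketch: in-sample RMSE of the fitted OLS model (in IMDb-score units)
rmse = np.sqrt(np.mean(model.resid ** 2))
print(f"RMSE: {rmse:.3f}")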
Detailed Interpretation of OLS Regression Results¶
The updated Ordinary Least Squares (OLS) regression model provides insights into how different movie ratings and runtime impact IMDb scores. The model includes variables for each rating category and the runtime of the movies.
Key Findings from the Regression Output:¶
- Constant (Intercept): The constant value is approximately 6.77, suggesting that if all other variables were zero (which is not practically possible), the expected IMDb score would be around 6.77.
- Runtime: The coefficient for runtime is -0.0003, which is not statistically significant (p-value = 0.830). This indicates that the length of the movies does not significantly affect their IMDb scores within this dataset.
- Ratings: Among the rating categories, the coefficients for TV-14, TV-MA, and TV-PG are statistically significant:
  - TV-14 has a coefficient of -0.6032 with a p-value < 0.001, indicating a significant negative association with IMDb scores relative to the baseline category.
  - TV-MA shows a similar negative association, with a coefficient of -0.6323 and a p-value < 0.001.
  - TV-PG has a coefficient of -0.4857 with a p-value of 0.004, suggesting a moderately negative association with scores.
- Other Ratings: PG, PG-13, R, TV-G, TV-Y, and TV-Y7 do not show statistically significant effects, indicating their influence on IMDb scores is not distinguishable from the baseline in this model.
Model Diagnostics and Considerations:¶
- Fit of the Model: The R-squared value is 0.060, and the adjusted R-squared is 0.043, indicating that only about 6% of the variance in IMDb scores is explained by this model. This suggests that other factors not included in the model might be influencing the IMDb scores.
- Multicollinearity: The condition number is quite high (1.18e+03), suggesting potential multicollinearity issues. This might be due to the high correlation between different rating categories or other included variables. It is advisable to check the variance inflation factor (VIF) for these variables to assess multicollinearity more definitively.
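As a hedged illustration of the VIF check suggested above, statsmodels exposes variance_inflation_factor; the sketch below applies it to the design matrix X and the variable_names list defined in the regression cell (the constant's VIF is typically ignored):
# Sketch: variance inflation factors for the OLS design matrix above
from statsmodels.stats.outliers_influence import variance_inflation_factor
vifs = [variance_inflation_factor(X, i) for i in range(X.shape[1])]
for name, vif in zip(variable_names, vifs):
    print(f"{name}: VIF = {vif:.2f}")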
Conclusions and Further Steps:¶
The analysis indicates that while certain TV ratings (TV-14, TV-MA, and TV-PG) are significant predictors of IMDb scores, many other factors are not captured by this model. Further research could include additional variables, such as genre, director, cast, or viewer demographics, which may provide more insight into the factors affecting IMDb scores. Additionally, addressing the multicollinearity and exploring non-linear relationships or interactions between variables could improve the model's explanatory power.
Diagnostic Tests¶
- Durbin-Watson: The Durbin-Watson statistic is approximately 2.06, suggesting that there is no substantial autocorrelation in the residuals.
- Condition Number: The high condition number (1.18e+03) suggests potential issues with multicollinearity. This can affect the reliability of the coefficients and might require further investigation to adjust the model or consider dimensionality reduction techniques.
Conclusions and Further Analysis¶
Given the low explanatory power of the model and issues such as multicollinearity, further analysis may be needed to identify other potential predictors or to refine the model. Consideration of additional variables, interaction terms, or non-linear models might provide better insights. Moreover, examining the residuals and fitting diagnostic plots could offer further clues on model adequacy and the need for transformations or different modeling approaches.
This regression analysis highlights the complexity of predicting IMDb scores and underscores the need for a robust selection of predictors and careful model diagnostics.
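As a hedged example of the residual checks mentioned above, the sketch below plots residuals against fitted values and a normal Q-Q plot for the fitted model:
# Sketch: basic residual diagnostics for the OLS model above
fitted = model.fittedvalues
residuals = model.resid
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
axes[0].scatter(fitted, residuals, alpha=0.5)
axes[0].axhline(0, color='red', linestyle='--')
axes[0].set_xlabel('Fitted values')
axes[0].set_ylabel('Residuals')
axes[0].set_title('Residuals vs Fitted')
sm.qqplot(residuals, line='45', fit=True, ax=axes[1])
axes[1].set_title('Normal Q-Q Plot of Residuals')
plt.tight_layout()
plt.show()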
Cluster Analysis¶
To further explore the data, we'll perform cluster analysis to segment the movies into groups based on their features like runtime, IMDb score, and others. This helps in identifying patterns or segments within the movies that share similar characteristics. We'll start with just 2 clusters to see what our data looks like.
Objectives¶
- Identify Natural Groupings: Discover how movies cluster together based on their attributes.
- Interpret Clusters: Analyze the characteristics that define each cluster.
from sklearn.cluster import KMeans
import numpy as np
# Standardizing the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
features = ['imdb_score', 'runtime']
X_scaled = scaler.fit_transform(merged_df[features])
# Running KMeans clustering
kmeans = KMeans(n_clusters=2, random_state=0).fit(X_scaled)
clusters = kmeans.labels_
# Adding cluster labels to the original data
merged_df['Cluster'] = clusters
# Plotting clusters
sns.scatterplot(x='runtime', y='imdb_score', hue='Cluster', data=merged_df, palette='viridis')
plt.title('Cluster of Movies by Runtime and IMDb Score')
plt.show()
Evaluating the Optimal Number of Clusters: Why Multiple Methods Matter¶
When conducting cluster analysis, finding the optimal number of clusters is critical as it directly influences the quality and interpretability of the results. Different methods can provide insights into how data points group together under various clustering scenarios, offering a more nuanced understanding of the underlying structure.
Observations from Initial Analysis with Two Clusters¶
Our initial analysis using two clusters shows no natural division within the data and does not provide meaningful insight into the finer structure of the dataset. With only two clusters, the split is too coarse, potentially obscuring subtler patterns that could be valuable for deeper analysis.
Need for Comprehensive Evaluation Methods¶
Given the limitations observed with an initial two-cluster solution, it's prudent to explore other methods to determine the most appropriate number of clusters. This involves not just identifying an elbow in the inertia plot but also assessing cluster quality and separation through other statistical techniques. By employing multiple evaluation methods, we can cross-verify the robustness of potential clustering solutions and ensure that the chosen number of clusters reflects both statistical validity and practical relevance.
Elbow Method¶
The Elbow Method is one of the most popular methods to determine the optimal number of clusters. It involves plotting the sum of squared distances of samples to their closest cluster center as a function of the number of clusters. We look for the 'elbow point,' where the rate of decrease sharply shifts, indicating that additional clusters beyond this point have diminishing returns.
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
# Note: X here is the regression design matrix built earlier (runtime, rating dummies, and a constant); it has not been rescaled
# Let's create a range of values for k
ks = range(1, 11)
inertias = []
for k in ks:
model = KMeans(n_clusters=k, random_state=42)
model.fit(X)
inertias.append(model.inertia_)
# Plotting the elbow curve
plt.figure(figsize=(8, 4))
plt.plot(ks, inertias, '-o')
plt.xlabel('number of clusters, k')
plt.ylabel('inertia')
plt.title('Elbow Method For Optimal k')
plt.xticks(ks)
plt.show()
Silhouette Method¶
The Silhouette Method measures the quality of clustering by determining how well each data point lies within its cluster. A high silhouette value indicates that the point is well matched to its own cluster and poorly matched to neighboring clusters. If the plot shows a peak at a certain number of clusters, this suggests that this number is optimal.
from sklearn.metrics import silhouette_score
# Silhouette scores over the same feature matrix X used for the elbow method above
silhouette_scores = []
ks = range(2, 11)
for k in ks:
model = KMeans(n_clusters=k, random_state=42)
labels = model.fit_predict(X)
score = silhouette_score(X, labels)
silhouette_scores.append(score)
# Plotting the silhouette scores
plt.figure(figsize=(8, 4))
plt.plot(ks, silhouette_scores, '-o')
plt.xlabel('number of clusters, k')
plt.ylabel('silhouette score')
plt.title('Silhouette Method For Optimal k')
plt.xticks(ks)
plt.show()
Final Decision on Optimal Number of Clusters¶
After employing both the Elbow and Silhouette methods to determine the most suitable number of clusters, our analysis shows a clear preference.
Elbow Method Results¶
Using the Elbow Method, we observed potential elbow points at 3 and 4 clusters. This suggests that either could be a viable choice based on the rate of decrease in inertia.
Silhouette Method Validation¶
To further refine our choice, we applied the Silhouette Method. This method assesses the quality of clustering by measuring how similar each data point is to its own cluster compared to other clusters. A higher silhouette score indicates better-defined clusters.
Silhouette Scores:¶
- 3 Clusters: Score > 0.6, indicating a strong structure and well-separated clusters.
- 4 Clusters: Score around 0.5, suggesting less separation and cohesion compared to 3 clusters.
Conclusion¶
Combining these findings, we conclude that 3 clusters provide the best balance of separation and cohesion, making it the optimal choice for our dataset. This decision is based on higher silhouette scores indicating clearer and more meaningful distinctions between clusters.
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
# Using the same feature matrix X as above
ks = range(2, 6) # Testing from 2 up to 5 clusters
silhouette_scores = []
for k in ks:
kmeans = KMeans(n_clusters=k, random_state=42)
labels = kmeans.fit_predict(X)
score = silhouette_score(X, labels)
silhouette_scores.append(score)
# Plotting silhouette scores for different cluster counts
plt.figure(figsize=(8, 4))
plt.plot(ks, silhouette_scores, '-o')
plt.xlabel('Number of clusters, k')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Scores by Number of Clusters')
plt.xticks(ks)
plt.show()
# Running KMeans clustering with the 3 clusters chosen above
kmeans = KMeans(n_clusters=3, random_state=0).fit(X_scaled)
clusters = kmeans.labels_
# Adding cluster labels to the original data
merged_df['Cluster'] = clusters
# Plotting clusters
sns.scatterplot(x='runtime', y='imdb_score', hue='Cluster', data=merged_df, palette='viridis')
plt.title('Cluster of Movies by Runtime and IMDb Score')
plt.show()
Principal Component Analysis (PCA)¶
Principal Component Analysis (PCA) is a statistical technique that simplifies the complexity in high-dimensional data while retaining trends and patterns. It does this by transforming the data into fewer dimensions, which act as summaries of features, called principal components. These components capture the most variance (information) in the data, making it easier to explore and visualize.
Objectives of PCA:¶
- Reduction of Dimensionality: Reduce the number of variables in the dataset while preserving as much information as possible.
- Visualization: Help in visualizing the data by reducing dimensions to 2D or 3D.
- Improved Insight: Facilitate better understanding and identification of patterns in the data.
Benefits of Using PCA:¶
- Reduces computational costs by decreasing the number of dimensions.
- Minimizes the complexity of the model, which can improve algorithm performance.
- Helps in identifying hidden patterns that are not observable in high-dimensional space.
We will apply PCA to our dataset to analyze how data points are grouped or separated in the lower-dimensional space.
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
# Standardize the regression design matrix X before applying PCA
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Applying PCA
pca = PCA(n_components=2) # Reduce data into 2 dimensions for visualization
X_pca = pca.fit_transform(X_scaled)
# Explained variance ratio
print("Explained variance ratio:", pca.explained_variance_ratio_)
# Plotting the PCA-transformed version of the data
plt.figure(figsize=(8, 6))
plt.scatter(X_pca[:, 0], X_pca[:, 1], alpha=0.7)
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('PCA of Dataset')
plt.grid(True)
plt.show()
Explained variance ratio: [0.15441856 0.13754093]
# Create a DataFrame for the PCA results
pca_scores_df = pd.DataFrame(data=X_pca, columns=['Principal Component 1', 'Principal Component 2'])
# Save the PCA scores to a CSV file
# pca_scores_df.to_csv('../data/netflix/pca_scores.csv', index=False)
# Save the explained variance to a text file
# explained_variance = pca.explained_variance_ratio_
# with open('../data/netflix/explained_variance.txt', 'w') as file:
# file.write('Explained variance by component: {}\n'.format(explained_variance))
# Provide the paths to the saved files
# print("PCA scores saved to: pca_scores.csv")
# print("Explained variance saved to: explained_variance.txt")
Analysis of PCA Results¶
In the PCA plot:
- Principal Component 1 (PC1): Represents the direction of maximum variance in the data. Data points spread along PC1 indicate the presence of variance or diversity concerning the features summarized by this component.
- Principal Component 2 (PC2): Captures the second most significant variance, orthogonal to PC1.
The scatter plot of the PCA scores shows how the data points are distributed across the first two principal components. The plot provides a visual representation of the data in the transformed feature space.
Regarding the explained variance, the first two components explain approximately 29.20% of the total variance in the data. While this is a significant amount, it suggests that there may be other important dimensions that contribute to the data's structure, as over 70% of the variance remains unexplained by these two components.
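To gauge how many components would be needed to capture most of the variance, a hedged sketch below refits PCA with all components on the same standardized matrix and plots the cumulative explained variance:
# Sketch: cumulative explained variance across all principal components
pca_full = PCA().fit(X_scaled)
cumulative = np.cumsum(pca_full.explained_variance_ratio_)
plt.figure(figsize=(8, 4))
plt.plot(range(1, len(cumulative) + 1), cumulative, '-o')
plt.axhline(0.9, color='red', linestyle='--', label='90% of variance')
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance')
plt.title('Cumulative Explained Variance by Component')
plt.legend()
plt.grid(True)
plt.show()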
With the current PCA reduction, we could now consider clustering the data in this two-dimensional space, which may reveal more about the data's structure or any natural groupings.
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
pca_scores_df = pd.DataFrame(data=X_pca, columns=['Principal Component 1', 'Principal Component 2'])
# Determining the silhouette score for different numbers of clusters
range_n_clusters = [2, 3, 4, 5, 6]
silhouette_avg_scores = []
for n_clusters in range_n_clusters:
# Initialize KMeans with the current number of clusters and fit to PCA scores
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
cluster_labels = kmeans.fit_predict(pca_scores_df)
# Calculate silhouette score and append to list
silhouette_avg = silhouette_score(pca_scores_df, cluster_labels)
silhouette_avg_scores.append(silhouette_avg)
print(f"For n_clusters = {n_clusters}, the average silhouette_score is: {silhouette_avg:.4f}")
# Plotting the silhouette scores
plt.figure(figsize=(8, 6))
plt.plot(range_n_clusters, silhouette_avg_scores, marker='o')
plt.title('Silhouette Scores for Different Numbers of Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('Average Silhouette Score')
plt.xticks(range_n_clusters)
plt.grid(True)
plt.show()
# Using the silhouette score, choose the optimal number of clusters and perform final clustering
optimal_n_clusters = range_n_clusters[silhouette_avg_scores.index(max(silhouette_avg_scores))]
print(f"Optimal number of clusters based on silhouette score: {optimal_n_clusters}")
# Perform KMeans with the optimal number of clusters
kmeans_optimal = KMeans(n_clusters=optimal_n_clusters, random_state=42)
kmeans_optimal.fit(pca_scores_df)
# Plotting final clusters
plt.figure(figsize=(10, 8))
plt.scatter(pca_scores_df['Principal Component 1'], pca_scores_df['Principal Component 2'],
c=kmeans_optimal.labels_, cmap='viridis', alpha=0.7)
plt.title(f'PCA Scores Clustered into {optimal_n_clusters} Groups')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.grid(True)
plt.show()
For n_clusters = 2, the average silhouette_score is: 0.5271
For n_clusters = 3, the average silhouette_score is: 0.6203
For n_clusters = 4, the average silhouette_score is: 0.6432
For n_clusters = 5, the average silhouette_score is: 0.6242
For n_clusters = 6, the average silhouette_score is: 0.6308
Optimal number of clusters based on silhouette score: 4
Summary of PCA and Clustering Analysis¶
PCA Results¶
We performed Principal Component Analysis (PCA) to reduce the dimensionality of our dataset, aiming to capture the most significant variance with fewer dimensions. The first two principal components explained approximately 29.20% of the total variance. While this amount is notable, it also suggests other dimensions may hold additional important variance.
Clustering with Silhouette Scores¶
To discover natural groupings within the PCA-reduced data, we utilized the silhouette score to determine the optimal number of clusters. The silhouette score measures how similar an object is to its own cluster compared to other clusters. A higher silhouette score indicates better-defined clusters.
The silhouette scores for different cluster counts were as follows:
- 2 clusters: Average score of 0.5271
- 3 clusters: Average score of 0.6203
- 4 clusters: Average score of 0.6432 (optimal)
- 5 clusters: Average score of 0.6242
- 6 clusters: Average score of 0.6308
Based on these scores, we chose to segment the data into 4 clusters, as it yielded the highest silhouette score, indicating a robust and meaningful cluster structure.
Final Clustering Visualization¶
The final clustering, with the data segmented into 4 groups, is visually represented below. Each data point is colored according to its cluster assignment, illustrating clear divisions among the groups within the PCA-reduced feature space.
This clustering provides us with a basis for further investigation into the characteristics of each group, potentially revealing insights into the dataset's underlying patterns and informing future analysis or decision-making processes.
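As a hedged starting point for that investigation, the sketch below attaches the final cluster labels to merged_df under an illustrative column name (pca_cluster) and summarizes IMDb score and runtime per cluster, assuming the rows of merged_df align one-to-one with the matrix used for PCA:
# Sketch: profile each PCA-based cluster on the original numeric features
# Assumes merged_df rows align with the rows used for PCA; 'pca_cluster' is an illustrative column name
merged_df['pca_cluster'] = kmeans_optimal.labels_
cluster_profile = merged_df.groupby('pca_cluster')[['imdb_score', 'runtime']].agg(['mean', 'median', 'count'])
print(cluster_profile)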
Conclusion¶
The PCA plot enables us to visually assess the structure and distribution of the data in a lower-dimensional space. By observing how data points are positioned relative to each other along principal components, we can gain insights into the underlying patterns and relationships that may not be apparent in the high-dimensional space.