Project: Netflix Prize Data

Load packages

# numerical calculation & data frames
import numpy as np
import pandas as pd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
import seaborn.objects as so

# --- pandas display options ---
# Show floats with three decimals; undo with pd.reset_option("display.float_format").
pd.set_option("display.float_format", "{:.3f}".format)
# Optional tweaks, left disabled:
#   pd.set_option("mode.copy_on_write", True)  # pandas 2.0
#   pd.set_option("display.max_rows", 7)       # max number of rows to display

# --- NumPy print options ---
# Two decimals; suppress=True turns off scientific notation.
np.set_printoptions(suppress=True, precision=2)

# --- seaborn.objects theme ---
from matplotlib import style

# Start from matplotlib's "ggplot" style sheet, then override grid and background.
theme_dict = dict(style.library["ggplot"])
theme_dict.update(
    {
        "grid.linestyle": ":",
        "axes.facecolor": "white",
        "grid.color": ".6",
    }
)
so.Plot.config.theme.update(theme_dict)

# Alternative theme (disabled): seaborn's "whitegrid" with dotted grid lines.
#   theme_dict = {**sns.axes_style("whitegrid"), "grid.linestyle": ":"}
#   so.Plot.config.theme.update(theme_dict)

# --- high-resolution ("retina") inline figures ---
import matplotlib_inline

matplotlib_inline.backend_inline.set_matplotlib_formats("retina")

from icecream import ic

# Netflix Prize inputs: the movie catalogue (CSV) ...
movie_titles = pd.read_csv(
    "/Users/skcho/Library/CloudStorage/Dropbox/Lectures/2025Fall/python-programming/py-programming/contents/final-projects/notes/data/netflix-prize-data-cleaned/movie_titles.csv"
)

# ... and the ratings table (Parquet), reduced to a reproducible 20% random
# subsample of its rows (fixed seed 123).
ratings = pd.read_parquet(
    "/Users/skcho/Library/CloudStorage/Dropbox/Lectures/2025Fall/python-programming/py-programming/contents/final-projects/notes/data/netflix-prize-data-cleaned/ratings_sample.parquet"
).sample(frac=0.2, random_state=123)

IMDB Movies Dataset

# Read the IMDB movie table and convert every column label to lower case.
movie_titles_imdb = pd.read_csv(
    "/Users/skcho/Library/CloudStorage/Dropbox/Lectures/2025Fall/python-programming/py-programming/contents/final-projects/notes/data/imdb-movies-data/2/imdb-movies-dataset.csv"
).rename(columns=str.lower)

# Print column names, non-null counts and dtypes.
movie_titles_imdb.info()

<class 'pandas.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   poster          10000 non-null  str    
 1   title           10000 non-null  str    
 2   year            9850 non-null   float64
 3   certificate     7370 non-null   str    
 4   duration (min)  9664 non-null   float64
 5   genre           9993 non-null   str    
 6   rating          9596 non-null   float64
 7   metascore       7555 non-null   float64
 8   director        9995 non-null   str    
 9   cast            9961 non-null   str    
 10  votes           9596 non-null   str    
 11  description     10000 non-null  str    
 12  review count    9999 non-null   str    
 13  review title    9483 non-null   str    
 14  review          9484 non-null   str    
dtypes: float64(4), str(11)
memory usage: 19.9 MB

# Number of IMDB rows per release year, shown for the ten largest year values.
imdb_year_counts = movie_titles_imdb["year"].value_counts()
imdb_year_counts.sort_index().tail(10)

year
2016.000    300
2017.000    300
2018.000    200
2019.000    250
2020.000    350
2021.000    500
2022.000    350
2023.000    850
2024.000    650
2025.000    150
Name: count, dtype: int64

# Same summary for the Netflix catalogue: rows per release year, ten largest year values.
netflix_year_counts = movie_titles["year"].value_counts()
netflix_year_counts.sort_index().tail(10)

year
1996     533
1997     653
1998     743
1999     965
2000    1234
2001    1184
2002    1310
2003    1271
2004    1436
2005     512
Name: count, dtype: int64

# Display the Netflix movie catalogue as loaded from movie_titles.csv.
movie_titles

	movie_id	year	title
0	1	2003	Dinosaur Planet
1	2	2004	Isle of Man TT 2004 Review
2	3	1997	Character
3	4	1994	Paula Abdul's Get Up & Dance
4	5	2004	The Rise and Fall of ECW
...	...	...	...
17758	17766	2002	Where the Wild Things Are and Other Maurice Se...
17759	17767	2004	Fidel Castro: American Experience
17760	17768	2000	Epoch
17761	17769	2003	The Company
17762	17770	2003	Alien Hunter

17763 rows × 3 columns

# Only the title, release year and genre are used from the IMDB table.
imdb_columns = ["title", "year", "genre"]
movie_titles_imdb = movie_titles_imdb[imdb_columns]

movie_titles_imdb

	title	year	genre
0	The Idea of You	2023.000	Comedy, Drama, Romance
1	Kingdom of the Planet of the Apes	2023.000	Action, Adventure, Sci-Fi
2	Unfrosted	2023.000	Biography, Comedy, History
3	The Fall Guy	2023.000	Action, Comedy, Drama
4	Challengers	2023.000	Drama, Romance, Sport
...	...	...	...
9995	The Greatest Show on Earth	2020.000	Drama, Family, Romance
9996	Berserk: Ougon Jidai-hen I - Haou no Tamago	2020.000	Animation, Action, Adventure
9997	Is-slottet	2020.000	Mystery, Drama
9998	Loving Pablo	2020.000	Biography, Crime, Drama
9999	Un homme et une femme	2020.000	Drama, Romance

10000 rows × 3 columns

# Turn the comma-separated genre string into a list of genre names; the
# pattern also consumes any whitespace on either side of each comma.
genre_lists = movie_titles_imdb["genre"].str.split(r"\s*,\s*")
movie_titles_imdb["genre"] = genre_lists

movie_titles_imdb

	title	year	genre
0	The Idea of You	2023.000	[Comedy, Drama, Romance]
1	Kingdom of the Planet of the Apes	2023.000	[Action, Adventure, Sci-Fi]
2	Unfrosted	2023.000	[Biography, Comedy, History]
3	The Fall Guy	2023.000	[Action, Comedy, Drama]
4	Challengers	2023.000	[Drama, Romance, Sport]
...	...	...	...
9995	The Greatest Show on Earth	2020.000	[Drama, Family, Romance]
9996	Berserk: Ougon Jidai-hen I - Haou no Tamago	2020.000	[Animation, Action, Adventure]
9997	Is-slottet	2020.000	[Mystery, Drama]
9998	Loving Pablo	2020.000	[Biography, Crime, Drama]
9999	Un homme et une femme	2020.000	[Drama, Romance]

10000 rows × 3 columns

# Re-check dtypes and non-null counts after the column selection and genre split.
movie_titles_imdb.info()

<class 'pandas.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   title   10000 non-null  str    
 1   year    9850 non-null   float64
 2   genre   9993 non-null   object 
dtypes: float64(1), object(1), str(1)
memory usage: 380.6+ KB

# Keep the first row of every (title, year) pair and discard the repeats.
is_repeat = movie_titles_imdb.duplicated(subset=["title", "year"])
movie_titles_imdb = movie_titles_imdb[~is_repeat]

# Example lookup of a single title in the Netflix catalogue.
movie_titles.query("title == 'Planet of the Apes'")

	movie_id	year	title
4055	4056	2001	Planet of the Apes
16319	16325	1968	Planet of the Apes

# Store the IMDB release year as pandas' nullable 32-bit integer dtype
# (the column is float64 with missing values before this cast).
year_as_int = movie_titles_imdb["year"].astype("Int32")
movie_titles_imdb["year"] = year_as_int

# Look up the same example title in the IMDB table.
movie_titles_imdb.query("title == 'Planet of the Apes'")

	title	year	genre
34	Planet of the Apes	2023	[Action, Adventure, Sci-Fi]

# Exploratory inner join on title only. Both frames carry a "year" column, so
# pandas suffixes them: year_x comes from IMDB (left), year_y from Netflix (right).
movie_titles_imdb.merge(movie_titles, on="title")

	title	year_x	genre	movie_id	year_y
0	Planet of the Apes	2023	[Action, Adventure, Sci-Fi]	4056	2001
1	Planet of the Apes	2023	[Action, Adventure, Sci-Fi]	16325	1968
2	Road House	2023	[Action, Thriller]	10274	1989
3	Dune	2023	[Action, Adventure, Drama]	7952	2000
4	Dune	2023	[Action, Adventure, Drama]	17064	1984
...	...	...	...	...	...
3651	No Man's Land	2020	[Comedy, Drama, War]	12110	2001
3652	Beyond the Law	2020	[Action, Crime, Thriller]	6147	1992
3653	South Central	2020	[Crime, Drama]	11588	1992
3654	Mutiny on the Bounty	2020	[Adventure, Biography, Drama]	1129	1935
3655	The Greatest Show on Earth	2020	[Drama, Family, Romance]	14578	1952

3656 rows × 5 columns

# Attach IMDB genres to the Netflix catalogue by title.
#
# The IMDB table was de-duplicated on (title, year), so one title can still
# occur several times with different years/genres.  Because the join key here
# is the title alone, each such repeat would produce an extra row per Netflix
# movie_id and, in turn, duplicate every rating of that movie when ratings are
# joined on movie_id.  Collapse to one genre row per title (first occurrence
# wins) before merging, and let pandas verify the left keys are unique.
#
# NOTE(review): matching on title alone still gives remakes that share a title
# the same genre list; the IMDB "year" is dropped rather than used as a key.
imdb_genres = (
    movie_titles_imdb
    .drop(columns=["year"])
    .drop_duplicates(subset=["title"])
)
movie_titles_merge = pd.merge(
    imdb_genres,
    movie_titles,
    on="title",
    validate="one_to_many",
)
movie_titles_merge

	title	genre	movie_id	year
0	Planet of the Apes	[Action, Adventure, Sci-Fi]	4056	2001
1	Planet of the Apes	[Action, Adventure, Sci-Fi]	16325	1968
2	Road House	[Action, Thriller]	10274	1989
3	Dune	[Action, Adventure, Drama]	7952	2000
4	Dune	[Action, Adventure, Drama]	17064	1984
...	...	...	...	...
3651	No Man's Land	[Comedy, Drama, War]	12110	2001
3652	Beyond the Law	[Action, Crime, Thriller]	6147	1992
3653	South Central	[Crime, Drama]	11588	1992
3654	Mutiny on the Bounty	[Adventure, Biography, Drama]	1129	1935
3655	The Greatest Show on Earth	[Drama, Family, Romance]	14578	1952

3656 rows × 4 columns

# Peek at the first two rows of the ratings sample.
ratings.iloc[:2]

	movie_id	user_id	rating	date
137	173	1823259	2	2003-08-28
18815	3648	67900	3	2004-06-16

# Attach title, genre and release year to every rating.
# The join key is spelled out instead of relying on pandas' default of joining
# on every column name the two frames share, so the result cannot silently
# change if either frame later gains another common column.  how="inner" keeps
# only ratings whose movie_id is present in movie_titles_merge.
netflix_ratings = pd.merge(
    ratings,
    movie_titles_merge,
    on="movie_id",
    how="inner",
)
netflix_ratings

	movie_id	user_id	rating	date	title	genre	year
0	3282	972104	4	2005-09-16	Sideways	[Comedy, Drama, Romance]	2004
1	143	2297762	5	2004-08-07	The Game	[Drama, Mystery, Thriller]	1997
2	1744	1489846	3	2003-05-22	Beverly Hills Cop	[Action, Comedy, Crime]	1984
3	357	1169994	5	2004-04-22	House of Sand and Fog	[Crime, Drama]	2003
4	3256	722964	3	2004-03-08	Swimming Pool	[Crime, Drama, Mystery]	2003
...	...	...	...	...	...	...	...
1862721	1585	813354	3	2005-02-09	Joy Ride	[Action, Mystery, Thriller]	2001
1862722	3782	1550938	3	2005-02-07	Flatliners	[Drama, Horror, Sci-Fi]	1990
1862723	3782	1550938	3	2005-02-07	Flatliners	[Drama, Horror, Mystery]	1990
1862724	483	868452	3	2003-09-29	Rush Hour 2	[Action, Comedy, Crime]	2001
1862725	2782	1465983	4	2005-06-22	Braveheart	[Biography, Drama, War]	1995

1862726 rows × 7 columns

# Persist the merged ratings table.  The target is relative to the current
# working directory, so create "data/" first; otherwise the write fails when
# that directory does not exist yet.
import os

os.makedirs("data", exist_ok=True)
netflix_ratings.to_parquet("data/netflix_ratings.parquet")