Project: Netflix Prize Data

Load packages

# numerical calculation & data frames
import numpy as np
import pandas as pd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
import seaborn.objects as so

# pandas options
pd.set_option('mode.copy_on_write', True)  # pandas 2.0
pd.options.display.float_format = '{:.3f}'.format  # pd.reset_option('display.float_format')
# pd.options.display.max_rows = 7  # max number of rows to display

# NumPy options
np.set_printoptions(precision = 2, suppress=True)  # suppress scientific notation

# matplotlib options
from matplotlib import style
theme_dict = {**style.library['ggplot'], "grid.linestyle": ":", 'axes.facecolor': 'white', 'grid.color': '.6',}
so.Plot.config.theme.update(theme_dict)

# theme_dict = {**sns.axes_style("whitegrid"), "grid.linestyle": ":"}
# so.Plot.config.theme.update(theme_dict)

# For high resolution display
import matplotlib_inline
matplotlib_inline.backend_inline.set_matplotlib_formats("retina")

from icecream import ic

movie_titles = pd.read_csv("/Users/skcho/Library/CloudStorage/Dropbox/Lectures/2025Fall/python-programming/py-programming/contents/final-projects/notes/data/netflix-prize-data-cleaned/movie_titles.csv")
ratings = pd.read_parquet("/Users/skcho/Library/CloudStorage/Dropbox/Lectures/2025Fall/python-programming/py-programming/contents/final-projects/notes/data/netflix-prize-data-cleaned/ratings_sample.parquet")

ratings = ratings.sample(frac=.2, random_state=123)

movie_titles_imdb = pd.read_csv("/Users/skcho/Library/CloudStorage/Dropbox/Lectures/2025Fall/python-programming/py-programming/contents/final-projects/notes/data/imdb-movies-data/2/imdb-movies-dataset.csv")

movie_titles_imdb.columns = movie_titles_imdb.columns.str.lower()

movie_titles_imdb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   poster          10000 non-null  object 
 1   title           10000 non-null  object 
 2   year            9850 non-null   float64
 3   certificate     7370 non-null   object 
 4   duration (min)  9664 non-null   float64
 5   genre           9993 non-null   object 
 6   rating          9596 non-null   float64
 7   metascore       7555 non-null   float64
 8   director        9995 non-null   object 
 9   cast            9961 non-null   object 
 10  votes           9596 non-null   object 
 11  description     10000 non-null  object 
 12  review count    9999 non-null   object 
 13  review title    9483 non-null   object 
 14  review          9484 non-null   object 
dtypes: float64(4), object(11)
memory usage: 1.1+ MB

movie_titles_imdb["year"].value_counts().sort_index().tail(10)

year
2016.000    300
2017.000    300
2018.000    200
2019.000    250
2020.000    350
2021.000    500
2022.000    350
2023.000    850
2024.000    650
2025.000    150
Name: count, dtype: int64

movie_titles["year"].value_counts().sort_index().tail(10)

year
1996     533
1997     653
1998     743
1999     965
2000    1234
2001    1184
2002    1310
2003    1271
2004    1436
2005     512
Name: count, dtype: int64

movie_titles

	movie_id	year	title
0	1	2003	Dinosaur Planet
1	2	2004	Isle of Man TT 2004 Review
2	3	1997	Character
3	4	1994	Paula Abdul's Get Up & Dance
4	5	2004	The Rise and Fall of ECW
...	...	...	...
17758	17766	2002	Where the Wild Things Are and Other Maurice Se...
17759	17767	2004	Fidel Castro: American Experience
17760	17768	2000	Epoch
17761	17769	2003	The Company
17762	17770	2003	Alien Hunter

17763 rows × 3 columns

movie_titles_imdb = movie_titles_imdb[["title", "year", "genre"]]

movie_titles_imdb

	title	year	genre
0	The Idea of You	2023.000	Comedy, Drama, Romance
1	Kingdom of the Planet of the Apes	2023.000	Action, Adventure, Sci-Fi
2	Unfrosted	2023.000	Biography, Comedy, History
3	The Fall Guy	2023.000	Action, Comedy, Drama
4	Challengers	2023.000	Drama, Romance, Sport
...	...	...	...
9995	The Greatest Show on Earth	2020.000	Drama, Family, Romance
9996	Berserk: Ougon Jidai-hen I - Haou no Tamago	2020.000	Animation, Action, Adventure
9997	Is-slottet	2020.000	Mystery, Drama
9998	Loving Pablo	2020.000	Biography, Crime, Drama
9999	Un homme et une femme	2020.000	Drama, Romance

10000 rows × 3 columns

movie_titles_imdb["genre"] = movie_titles_imdb["genre"].str.split(r"\s*,\s*")

movie_titles_imdb

	title	year	genre
0	The Idea of You	2023.000	[Comedy, Drama, Romance]
1	Kingdom of the Planet of the Apes	2023.000	[Action, Adventure, Sci-Fi]
2	Unfrosted	2023.000	[Biography, Comedy, History]
3	The Fall Guy	2023.000	[Action, Comedy, Drama]
4	Challengers	2023.000	[Drama, Romance, Sport]
...	...	...	...
9995	The Greatest Show on Earth	2020.000	[Drama, Family, Romance]
9996	Berserk: Ougon Jidai-hen I - Haou no Tamago	2020.000	[Animation, Action, Adventure]
9997	Is-slottet	2020.000	[Mystery, Drama]
9998	Loving Pablo	2020.000	[Biography, Crime, Drama]
9999	Un homme et une femme	2020.000	[Drama, Romance]

10000 rows × 3 columns

movie_titles_imdb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   title   10000 non-null  object 
 1   year    9850 non-null   float64
 2   genre   9993 non-null   object 
dtypes: float64(1), object(2)
memory usage: 234.5+ KB

movie_titles_imdb = movie_titles_imdb.drop_duplicates(subset=["title", "year"])

movie_titles.query("title == 'Planet of the Apes'")

	movie_id	year	title
4055	4056	2001	Planet of the Apes
16319	16325	1968	Planet of the Apes

movie_titles_imdb['year'] = movie_titles_imdb['year'].astype("Int32")

movie_titles_imdb.query("title == 'Planet of the Apes'")

	title	year	genre
34	Planet of the Apes	2023	[Action, Adventure, Sci-Fi]

pd.merge(movie_titles_imdb, movie_titles, on="title")

	title	year_x	genre	movie_id	year_y
0	Planet of the Apes	2023	[Action, Adventure, Sci-Fi]	4056	2001
1	Planet of the Apes	2023	[Action, Adventure, Sci-Fi]	16325	1968
2	Road House	2023	[Action, Thriller]	10274	1989
3	Dune	2023	[Action, Adventure, Drama]	7952	2000
4	Dune	2023	[Action, Adventure, Drama]	17064	1984
...	...	...	...	...	...
3651	No Man's Land	2020	[Comedy, Drama, War]	12110	2001
3652	Beyond the Law	2020	[Action, Crime, Thriller]	6147	1992
3653	South Central	2020	[Crime, Drama]	11588	1992
3654	Mutiny on the Bounty	2020	[Adventure, Biography, Drama]	1129	1935
3655	The Greatest Show on Earth	2020	[Drama, Family, Romance]	14578	1952

3656 rows × 5 columns

movie_titles_merge = pd.merge(movie_titles_imdb.drop(columns=["year"]), movie_titles, on="title")
movie_titles_merge

	title	genre	movie_id	year
0	Planet of the Apes	[Action, Adventure, Sci-Fi]	4056	2001
1	Planet of the Apes	[Action, Adventure, Sci-Fi]	16325	1968
2	Road House	[Action, Thriller]	10274	1989
3	Dune	[Action, Adventure, Drama]	7952	2000
4	Dune	[Action, Adventure, Drama]	17064	1984
...	...	...	...	...
3651	No Man's Land	[Comedy, Drama, War]	12110	2001
3652	Beyond the Law	[Action, Crime, Thriller]	6147	1992
3653	South Central	[Crime, Drama]	11588	1992
3654	Mutiny on the Bounty	[Adventure, Biography, Drama]	1129	1935
3655	The Greatest Show on Earth	[Drama, Family, Romance]	14578	1952

3656 rows × 4 columns

ratings.head(2)

	movie_id	user_id	rating	date
137	173	1823259	2	2003-08-28
18815	3648	67900	3	2004-06-16

netflix_ratings = pd.merge(ratings, movie_titles_merge)
netflix_ratings

	movie_id	user_id	rating	date	title	genre	year
0	3282	972104	4	2005-09-16	Sideways	[Comedy, Drama, Romance]	2004
1	143	2297762	5	2004-08-07	The Game	[Drama, Mystery, Thriller]	1997
2	1744	1489846	3	2003-05-22	Beverly Hills Cop	[Action, Comedy, Crime]	1984
3	357	1169994	5	2004-04-22	House of Sand and Fog	[Crime, Drama]	2003
4	3256	722964	3	2004-03-08	Swimming Pool	[Crime, Drama, Mystery]	2003
...	...	...	...	...	...	...	...
1862721	1585	813354	3	2005-02-09	Joy Ride	[Action, Mystery, Thriller]	2001
1862722	3782	1550938	3	2005-02-07	Flatliners	[Drama, Horror, Sci-Fi]	1990
1862723	3782	1550938	3	2005-02-07	Flatliners	[Drama, Horror, Mystery]	1990
1862724	483	868452	3	2003-09-29	Rush Hour 2	[Action, Comedy, Crime]	2001
1862725	2782	1465983	4	2005-06-22	Braveheart	[Biography, Drama, War]	1995

1862726 rows × 7 columns

netflix_ratings.to_parquet("data/netflix_ratings.parquet")

Cleaned Data

netflix = pd.read_parquet("data/netflix_ratings.parquet")

netflix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1862726 entries, 0 to 1862725
Data columns (total 7 columns):
 #   Column    Dtype         
---  ------    -----         
 0   movie_id  Int16         
 1   user_id   Int32         
 2   rating    Int8          
 3   date      datetime64[ns]
 4   title     object        
 5   genre     object        
 6   year      int64         
dtypes: Int16(1), Int32(1), Int8(1), datetime64[ns](1), int64(1), object(2)
memory usage: 74.6+ MB

netflix.head(2)

	movie_id	user_id	rating	date	title	genre	year
0	3282	972104	4	2005-09-16	Sideways	[Comedy, Drama, Romance]	2004
1	143	2297762	5	2004-08-07	The Game	[Drama, Mystery, Thriller]	1997

평점 분포

mean_ratings = (
    netflix
    .groupby(["title"])["rating"]
    .agg(["mean", "std", "count"])
    .reset_index()
)
mean_ratings

	title	mean	std	count
0	10	3.104	0.956	498
1	10 Things I Hate About You	3.728	0.992	4705
2	11:14	3.203	1.030	266
3	13 Ghosts	3.557	1.129	758
4	1984	3.367	1.131	488
...	...	...	...	...
820	Wonder Boys	3.552	0.968	3278
821	Wonderland	3.000	1.098	152
822	Wrongfully Accused	3.290	1.143	252
823	Yellow Submarine	3.575	1.105	784
824	Youngblood	3.256	1.029	328

825 rows × 4 columns

mean_ratings.sort_values("mean", ascending=False)

	title	mean	std	count
430	Paradise Lost: The Child Murders at Robin Hood...	4.440	0.651	25
734	The Sixth Sense	4.329	0.793	15166
733	The Silence of the Lambs	4.310	0.815	12940
101	Braveheart	4.289	0.901	13590
72	Batman Begins	4.215	0.860	5529
...	...	...	...	...
641	The Gunman	2.000	1.000	13
306	Inseminoid	2.000	1.054	10
468	Red Riding Hood	1.923	1.038	13
476	Rhinestone	1.909	1.063	66
562	Stuck on You	1.714	0.845	21

825 rows × 4 columns

(
    so.Plot(mean_ratings, x="mean", y="std")
    .add(so.Dots(alpha=.3), pointsize="count")
    .add(so.Line(), so.PolyFit(5))
    .layout(size=(9, 8))
)

(
    so.Plot(mean_ratings, x="mean", y="std")
    .add(so.Dots(alpha=.3), pointsize="count")
    .add(so.Line(), so.PolyFit(5))
    .layout(size=(9, 8))
)

시청자별 분석

viewing_count = netflix.groupby("user_id")["rating"].size()
viewing_count

user_id
6          15
7          15
10          4
33          2
59          6
           ..
2649384     1
2649388     8
2649401     5
2649426     3
2649429     6
Name: rating, Length: 336915, dtype: Int64

# pandas의 method를 사용한 시각화
viewing_count.hist(bins=100);

viewing_count_df = viewing_count.reset_index(name="count")
(
    so.Plot(viewing_count_df, x="count")
    .add(so.Bars(), so.Hist(bins=100))
    .limit(y=(0, 100))
)

user_stats = (
    netflix
    .groupby("user_id")["rating"]
    .agg(["mean", "std", "count"])
)
user_stats

	mean	std	count
user_id
6	3.200	0.414	15
7	4.067	1.033	15
10	3.500	1.732	4
33	3.500	0.707	2
59	4.000	1.549	6
...	...	...	...
2649384	3.000	<NA>	1
2649388	3.125	0.835	8
2649401	4.200	1.095	5
2649426	3.667	0.577	3
2649429	4.000	0.894	6

336915 rows × 3 columns

user_stat_30 = user_stats.query("count >= 30")
user_stat_30

	mean	std	count
user_id
1333	2.674	0.778	43
2213	3.871	0.846	31
2455	3.433	0.817	30
2905	3.700	1.418	30
3321	2.977	1.012	43
...	...	...	...
2645579	3.935	0.680	46
2646347	3.263	1.032	38
2646574	3.119	0.803	42
2647197	3.389	1.153	36
2648287	3.600	0.847	35

3051 rows × 3 columns

(
    so.Plot(user_stat_30, x="mean", y="std")
    .add(so.Dot(alpha=.2))
    .add(so.Line(color=".2"), so.PolyFit(5))
    .layout(size=(9, 8))
    .scale(pointsize=(3, 10))
)

장르별 분석

netflix["genre"]

0             [Comedy, Drama, Romance]
1           [Drama, Mystery, Thriller]
2              [Action, Comedy, Crime]
3                       [Crime, Drama]
4              [Crime, Drama, Mystery]
                      ...             
1862721    [Action, Mystery, Thriller]
1862722        [Drama, Horror, Sci-Fi]
1862723       [Drama, Horror, Mystery]
1862724        [Action, Comedy, Crime]
1862725        [Biography, Drama, War]
Name: genre, Length: 1862726, dtype: object

netflix_long = netflix.explode('genre')

netflix_long

	movie_id	user_id	rating	date	title	genre	year
0	3282	972104	4	2005-09-16	Sideways	Comedy	2004
0	3282	972104	4	2005-09-16	Sideways	Drama	2004
0	3282	972104	4	2005-09-16	Sideways	Romance	2004
1	143	2297762	5	2004-08-07	The Game	Drama	1997
1	143	2297762	5	2004-08-07	The Game	Mystery	1997
...	...	...	...	...	...	...	...
1862724	483	868452	3	2003-09-29	Rush Hour 2	Comedy	2001
1862724	483	868452	3	2003-09-29	Rush Hour 2	Crime	2001
1862725	2782	1465983	4	2005-06-22	Braveheart	Biography	1995
1862725	2782	1465983	4	2005-06-22	Braveheart	Drama	1995
1862725	2782	1465983	4	2005-06-22	Braveheart	War	1995

4806517 rows × 7 columns

(
    so.Plot(netflix_long, y="genre")  # y에 genre가 나오도록!
    .add(so.Bar(), so.Hist("proportion"))
)

genre_mean = (
    netflix_long
    .groupby('genre')['rating']
    .agg(['mean', 'std', 'count'])
    .reset_index()
)
genre_mean

	genre	mean	std	count
0	Action	3.557	1.050	490601
1	Adventure	3.576	1.064	316881
2	Animation	3.811	1.018	44297
3	Biography	3.878	0.995	121688
4	Comedy	3.528	1.067	770697
5	Crime	3.624	1.043	374283
6	Documentary	3.810	1.070	23833
7	Drama	3.661	1.045	939328
8	Family	3.588	1.057	125263
9	Fantasy	3.552	1.057	207485
10	Film-Noir	3.965	0.935	9020
11	History	3.550	1.014	16961
12	Horror	3.386	1.065	177556
13	Music	3.683	1.084	66609
14	Musical	3.616	1.033	24462
15	Mystery	3.577	1.043	206452
16	Romance	3.566	1.053	351176
17	Sci-Fi	3.332	1.090	115480
18	Sport	3.687	0.994	36883
19	Thriller	3.620	1.030	348897
20	War	3.908	1.037	32304
21	Western	3.577	1.045	6361

genre_mean.sort_values("count", ascending=False).tail(10)

	genre	mean	std	count
17	Sci-Fi	3.332	1.090	115480
13	Music	3.683	1.084	66609
2	Animation	3.811	1.018	44297
18	Sport	3.687	0.994	36883
20	War	3.908	1.037	32304
14	Musical	3.616	1.033	24462
6	Documentary	3.810	1.070	23833
11	History	3.550	1.014	16961
10	Film-Noir	3.965	0.935	9020
21	Western	3.577	1.045	6361

genre_mean.sort_values("mean", ascending=False)

	genre	mean	std	count
10	Film-Noir	3.965	0.935	9020
20	War	3.908	1.037	32304
3	Biography	3.878	0.995	121688
2	Animation	3.811	1.018	44297
6	Documentary	3.810	1.070	23833
18	Sport	3.687	0.994	36883
13	Music	3.683	1.084	66609
7	Drama	3.661	1.045	939328
5	Crime	3.624	1.043	374283
19	Thriller	3.620	1.030	348897
14	Musical	3.616	1.033	24462
8	Family	3.588	1.057	125263
21	Western	3.577	1.045	6361
15	Mystery	3.577	1.043	206452
1	Adventure	3.576	1.064	316881
16	Romance	3.566	1.053	351176
0	Action	3.557	1.050	490601
9	Fantasy	3.552	1.057	207485
11	History	3.550	1.014	16961
4	Comedy	3.528	1.067	770697
12	Horror	3.386	1.065	177556
17	Sci-Fi	3.332	1.090	115480

order_by_mean = genre_mean.sort_values("mean", ascending=False)["genre"].values

(
    so.Plot(genre_mean, y="genre", x="mean")
    .add(so.Bar(), so.Dodge())
    .scale(y=so.Nominal(order=order_by_mean))  # 그래프에 순서 부여
    .limit(x=(3, 4.1))
)

(
    so.Plot(genre_mean, y="genre", x="std")
    .add(so.Bar(), so.Dodge())
    .scale(y=so.Nominal(order=order_by_mean))  # 그래프에 순서 부여
    .limit(x=(.9, 1.1))
)

# 평균과 표준편차를 error bar를 이용해 볼 수도 있음.
(
    so.Plot(netflix_long, x="rating", y="genre")
    .add(so.Range(), so.Est("mean", errorbar="sd"))
    .add(so.Dot(pointsize=3), so.Agg("mean"))
    .scale(y=so.Nominal(order=order_by_mean)) # 그래프에 순서 부여
)

출시년도

netflix.head(2)

	movie_id	user_id	rating	date	title	genre	year
0	3282	972104	4	2005-09-16	Sideways	[Comedy, Drama, Romance]	2004
1	143	2297762	5	2004-08-07	The Game	[Drama, Mystery, Thriller]	1997

netflix.groupby(["year", "title"])["rating"].agg(["mean", "count"])

		mean	count
year	title
1916	20,000 Leagues Under the Sea	3.704	162
1918	Chaplin	3.000	47
1922	Robin Hood	3.080	75
1927	It	4.067	15
1927	The Little Rascals	3.789	114
...	...	...	...
2005	The Amityville Horror	3.502	1947
	The Ballad of Jack and Rose	2.971	313
	The Hitchhiker's Guide to the Galaxy	2.997	2949
	The Pacifier	3.580	3966
	Unleashed	3.733	845

844 rows × 2 columns

(
    so.Plot(netflix, x="year", y="rating")
    .add(so.Line(marker="."), so.Agg("mean"))
)

# 10년 단위로 
netflix["decade"] = netflix["year"] // 10 * 10
(
    so.Plot(netflix, x="decade", y="rating")
    .add(so.Line(marker="."), so.Agg("mean"))
)