Project: Netflix Prize Data

Load packages

# numerical calculation & data frames
import numpy as np
import pandas as pd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
import seaborn.objects as so

# pandas options
pd.options.mode.copy_on_write = True  # opt in to copy-on-write behaviour (pandas 2.x)
pd.set_option("display.float_format", "{:.3f}".format)  # undo with pd.reset_option("display.float_format")
# pd.options.display.max_rows = 7  # max number of rows to display

# NumPy options
np.set_printoptions(suppress=True, precision=2)  # suppress scientific notation

# matplotlib options
from matplotlib import style
# Start from matplotlib's "ggplot" style sheet and override the grid/background settings.
theme_dict = {
    **style.library["ggplot"],
    "grid.linestyle": ":",
    "axes.facecolor": "white",
    "grid.color": ".6",
}
so.Plot.config.theme.update(theme_dict)

# theme_dict = {**sns.axes_style("whitegrid"), "grid.linestyle": ":"}
# so.Plot.config.theme.update(theme_dict)

# For high resolution display
import matplotlib_inline
matplotlib_inline.backend_inline.set_matplotlib_formats("retina")

Cleaned Data

File: data/netflix_ratings.parquet

# Load the cleaned ratings table from the parquet file (path is relative to the working directory).
netflix = pd.read_parquet("data/netflix_ratings.parquet")

netflix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1862726 entries, 0 to 1862725
Data columns (total 7 columns):
 #   Column    Dtype         
---  ------    -----         
 0   movie_id  Int16         
 1   user_id   Int32         
 2   rating    Int8          
 3   date      datetime64[ns]
 4   title     object        
 5   genre     object        
 6   year      int64         
dtypes: Int16(1), Int32(1), Int8(1), datetime64[ns](1), int64(1), object(2)
memory usage: 74.6+ MB

pd.DataFrame?

netflix

	movie_id	user_id	rating	date	title	genre	year
0	3282	972104	4	2005-09-16	Sideways	[Comedy, Drama, Romance]	2004
1	143	2297762	5	2004-08-07	The Game	[Drama, Mystery, Thriller]	1997
2	1744	1489846	3	2003-05-22	Beverly Hills Cop	[Action, Comedy, Crime]	1984
3	357	1169994	5	2004-04-22	House of Sand and Fog	[Crime, Drama]	2003
4	3256	722964	3	2004-03-08	Swimming Pool	[Crime, Drama, Mystery]	2003
...	...	...	...	...	...	...	...
1862721	1585	813354	3	2005-02-09	Joy Ride	[Action, Mystery, Thriller]	2001
1862722	3782	1550938	3	2005-02-07	Flatliners	[Drama, Horror, Sci-Fi]	1990
1862723	3782	1550938	3	2005-02-07	Flatliners	[Drama, Horror, Mystery]	1990
1862724	483	868452	3	2003-09-29	Rush Hour 2	[Action, Comedy, Crime]	2001
1862725	2782	1465983	4	2005-06-22	Braveheart	[Biography, Drama, War]	1995

1862726 rows × 7 columns

Data Wrangling

대략 다음과 같은 transform들을 조합하여 분석에 필요한 상태로 바꿈

- 변수들(열)과 관측치(행)를 선택: subsetting
- 조건에 맞는 부분(관측치, 행)만 필터링: `query()`
- 조건에 맞도록 행을 재정렬: `sort_values()`
- 변수들과 함수들을 이용하여 새로운 변수를 생성: `assign()`
- 카테고리별로 나뉜 데이터에 대한 통계치를 생성: `groupby()`, `agg()`, `apply()`

# Ratings of films with year >= 1990, keeping four columns and adding the
# decade of `year` and the day of week of `date`; rows are sorted by year.
netflix_1990 = (
    netflix[["title", "rating", "date", "year"]]
    .loc[lambda df: df["year"] >= 1990]
    .assign(
        decade=lambda df: df["year"].floordiv(10).mul(10),  # 10-year bins, e.g. 1997 -> 1990
        weekday=lambda df: df["date"].dt.day_name().str.slice(stop=3),  # 3-letter day name, e.g. "Sat"
    )
    .sort_values("year")
)
netflix_1990

	title	rating	date	year	decade	weekday
150148	Flatliners	3	2004-08-14	1990	1990	Sat
1269856	Look Who's Talking Too	3	2004-09-24	1990	1990	Fri
562709	Ghost	2	2004-12-28	1990	1990	Tue
1667117	The Grifters	3	2005-05-15	1990	1990	Sun
1269834	Ghost	4	2002-03-01	1990	1990	Fri
...	...	...	...	...	...	...
456802	Beauty Shop	5	2005-10-22	2005	2000	Sat
608408	Hostage	4	2005-10-10	2005	2000	Mon
456776	Hostage	5	2005-07-18	2005	2000	Mon
1818277	Coach Carter	4	2005-08-03	2005	2000	Wed
519199	The Hitchhiker's Guide to the Galaxy	3	2005-11-22	2005	2000	Tue

1496107 rows × 6 columns

평점 분포

# Per-title rating summary: mean, standard deviation, and number of ratings.
# NOTE(review): this yields 825 title groups, while grouping by (year, title)
# later in the notebook yields 844 - some distinct films appear to share a
# title and are pooled here; confirm whether that is intended.
mean_ratings = (
    netflix.groupby("title")["rating"]
    .agg(["mean", "std", "count"])
    .reset_index()  # move the title index back into a regular column
)
mean_ratings

	title	mean	std	count
0	10	3.104	0.956	498
1	10 Things I Hate About You	3.728	0.992	4705
2	11:14	3.203	1.030	266
3	13 Ghosts	3.557	1.129	758
4	1984	3.367	1.131	488
...	...	...	...	...
820	Wonder Boys	3.552	0.968	3278
821	Wonderland	3.000	1.098	152
822	Wrongfully Accused	3.290	1.143	252
823	Yellow Submarine	3.575	1.105	784
824	Youngblood	3.256	1.029	328

825 rows × 4 columns

# Rank titles from highest to lowest mean rating (display only; mean_ratings is not modified).
mean_ratings.sort_values("mean", ascending=False)

	title	mean	std	count
430	Paradise Lost: The Child Murders at Robin Hood...	4.440	0.651	25
734	The Sixth Sense	4.329	0.793	15166
733	The Silence of the Lambs	4.310	0.815	12940
101	Braveheart	4.289	0.901	13590
72	Batman Begins	4.215	0.860	5529
...	...	...	...	...
641	The Gunman	2.000	1.000	13
306	Inseminoid	2.000	1.054	10
468	Red Riding Hood	1.923	1.038	13
476	Rhinestone	1.909	1.063	66
562	Stuck on You	1.714	0.845	21

825 rows × 4 columns

# Per-title mean rating (x) against standard deviation (y);
# dot size is mapped to the number of ratings for the title.
(
    so.Plot(mean_ratings, x="mean", y="std")
    .add(so.Dots(alpha=.5), pointsize="count")
    .add(so.Line(), so.PolyFit(5))  # degree-5 polynomial trend line
    .scale(pointsize=(3, 20))  # range of dot sizes
    .layout(size=(8, 7))
)

# Full ratings table plus derived columns: decade of `year`, day of week of
# `date`, a weekend flag, and the number of characters in the title.
netflix2 = netflix.assign(
    decade=lambda df: df["year"].floordiv(10).mul(10),  # 10-year bins
    weekday=lambda df: df["date"].dt.day_name().str.slice(stop=3),
    weekend=lambda df: df["weekday"].isin(["Sat", "Sun"]),
    title_length=lambda df: df["title"].str.len(),
)

# Store weekday as a categorical whose categories run Mon..Sun, so grouped
# output and plots follow calendar order instead of alphabetical order.
netflix2["weekday"] = netflix2["weekday"].astype(
    pd.CategoricalDtype(["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"])
)

# Rating summary (mean, std, count) for each day of the week.
mean_ratings_by_wday = (
    netflix2.groupby("weekday", observed=True)["rating"]  # observed=True: only categories present in the data
    .agg(["mean", "std", "count"])
    .reset_index()
)
mean_ratings_by_wday

	weekday	mean	std	count
0	Mon	3.584	1.054	324006
1	Tue	3.582	1.054	331272
2	Wed	3.590	1.055	311012
3	Thu	3.593	1.057	267394
4	Fri	3.589	1.061	245186
5	Sat	3.596	1.065	184385
6	Sun	3.596	1.060	199471

요일별 평점

# Mean rating per weekday; dot size is mapped to the number of ratings on that day.
(
    so.Plot(mean_ratings_by_wday, x="weekday", y="mean")
    .add(so.Dot(), pointsize="count")
)

제목의 길이와 평점의 관계?

# Mean rating at each title length: so.Agg("mean") collapses all ratings
# sharing the same title_length into a single dot.
(
    so.Plot(netflix2, x="title_length", y="rating")
    .add(so.Dot(), so.Agg("mean"))
)

시청자별 분석

# Number of rating rows per user_id (size() counts rows in each group).
viewing_count = netflix.groupby("user_id")["rating"].size()
viewing_count

user_id
6          15
7          15
10          4
33          2
59          6
           ..
2649384     1
2649388     8
2649401     5
2649426     3
2649429     6
Name: rating, Length: 336915, dtype: Int64

# Visualization using the pandas plotting method
# (the trailing semicolon suppresses the cell's text output in Jupyter)
viewing_count.hist(bins=100);

# Same distribution with seaborn.objects: turn the Series into a two-column
# frame (user_id, count) and draw a 50-bin histogram.
viewing_count_df = viewing_count.reset_index(name="count")
(
    so.Plot(viewing_count_df, x="count")
    .add(so.Bars(), so.Hist(bins=50))
    .limit(y=(0, 100))  # y-axis restricted to 0-100; presumably to make the sparse tail visible
)

# Per-user rating summary (mean, std, count), indexed by user_id.
# std is <NA> for a user with a single rating.
user_stats = (
    netflix
    .groupby("user_id")["rating"]
    .agg(["mean", "std", "count"])
)
user_stats

	mean	std	count
user_id
6	3.200	0.414	15
7	4.067	1.033	15
10	3.500	1.732	4
33	3.500	0.707	2
59	4.000	1.549	6
...	...	...	...
2649384	3.000	<NA>	1
2649388	3.125	0.835	8
2649401	4.200	1.095	5
2649426	3.667	0.577	3
2649429	4.000	0.894	6

336915 rows × 3 columns

# Keep only users with at least 30 ratings.
user_stat_30 = user_stats.loc[user_stats["count"] >= 30]
user_stat_30

	mean	std	count
user_id
1333	2.674	0.778	43
2213	3.871	0.846	31
2455	3.433	0.817	30
2905	3.700	1.418	30
3321	2.977	1.012	43
...	...	...	...
2645579	3.935	0.680	46
2646347	3.263	1.032	38
2646574	3.119	0.803	42
2647197	3.389	1.153	36
2648287	3.600	0.847	35

3051 rows × 3 columns

# Per-user mean rating (x) against standard deviation (y) for users in user_stat_30.
(
    so.Plot(user_stat_30, x="mean", y="std")
    .add(so.Dot(alpha=.2))  # low alpha so overlapping dots remain readable
    .add(so.Line(color=".5"), so.PolyFit(5))  # degree-5 polynomial trend line in grey
    .layout(size=(8, 7))
)

장르별 분석

netflix["genre"]

0             [Comedy, Drama, Romance]
1           [Drama, Mystery, Thriller]
2              [Action, Comedy, Crime]
3                       [Crime, Drama]
4              [Crime, Drama, Mystery]
                      ...             
1862721    [Action, Mystery, Thriller]
1862722        [Drama, Horror, Sci-Fi]
1862723       [Drama, Horror, Mystery]
1862724        [Action, Comedy, Crime]
1862725        [Biography, Drama, War]
Name: genre, Length: 1862726, dtype: object

# Long format: one row per (rating, genre) pair. Each list in `genre` is
# expanded into separate rows, so original index values repeat.
netflix_long = netflix.explode('genre')

netflix_long

	movie_id	user_id	rating	date	title	genre	year
0	3282	972104	4	2005-09-16	Sideways	Comedy	2004
0	3282	972104	4	2005-09-16	Sideways	Drama	2004
0	3282	972104	4	2005-09-16	Sideways	Romance	2004
1	143	2297762	5	2004-08-07	The Game	Drama	1997
1	143	2297762	5	2004-08-07	The Game	Mystery	1997
...	...	...	...	...	...	...	...
1862724	483	868452	3	2003-09-29	Rush Hour 2	Comedy	2001
1862724	483	868452	3	2003-09-29	Rush Hour 2	Crime	2001
1862725	2782	1465983	4	2005-06-22	Braveheart	Biography	1995
1862725	2782	1465983	4	2005-06-22	Braveheart	Drama	1995
1862725	2782	1465983	4	2005-06-22	Braveheart	War	1995

4806517 rows × 7 columns

# Share of exploded rating rows belonging to each genre, as horizontal bars.
(
    so.Plot(netflix_long, y="genre")  # put genre on the y-axis
    .add(so.Bar(), so.Hist("proportion"))
)

# Rating summary (mean, std, count) for each genre, computed on the exploded table.
genre_mean = (
    netflix_long.groupby("genre")["rating"]
    .agg(["mean", "std", "count"])
    .reset_index()  # genre becomes a regular column
)
genre_mean

	genre	mean	std	count
0	Action	3.557	1.050	490601
1	Adventure	3.576	1.064	316881
2	Animation	3.811	1.018	44297
3	Biography	3.878	0.995	121688
4	Comedy	3.528	1.067	770697
5	Crime	3.624	1.043	374283
6	Documentary	3.810	1.070	23833
7	Drama	3.661	1.045	939328
8	Family	3.588	1.057	125263
9	Fantasy	3.552	1.057	207485
10	Film-Noir	3.965	0.935	9020
11	History	3.550	1.014	16961
12	Horror	3.386	1.065	177556
13	Music	3.683	1.084	66609
14	Musical	3.616	1.033	24462
15	Mystery	3.577	1.043	206452
16	Romance	3.566	1.053	351176
17	Sci-Fi	3.332	1.090	115480
18	Sport	3.687	0.994	36883
19	Thriller	3.620	1.030	348897
20	War	3.908	1.037	32304
21	Western	3.577	1.045	6361

# The three genres with the highest mean rating.
genre_mean.nlargest(3, "mean")

	genre	mean	std	count
10	Film-Noir	3.965	0.935	9020
20	War	3.908	1.037	32304
3	Biography	3.878	0.995	121688

# Genre names ordered from highest to lowest mean rating; used to order the y-axis.
order_by_mean = genre_mean.sort_values("mean", ascending=False)["genre"].values

# Mean rating per genre as horizontal bars.
(
    so.Plot(genre_mean, y="genre", x="mean")
    .add(so.Bar())
    .scale(y=so.Nominal(order=order_by_mean))  # impose the ordering on the plot axis
    .limit(x=(3, 4.1))  # x-axis starts at 3, not 0, so bar lengths exaggerate differences
)

# Standard deviation of ratings per genre, with genres kept in mean-rating order.
(
    so.Plot(genre_mean, y="genre", x="std")
    .add(so.Bar())
    .scale(y=so.Nominal(order=order_by_mean))  # impose the ordering on the plot axis
    .limit(x=(.9, 1.1))  # x-axis restricted to 0.9-1.1
)

# Add the day of week of `date` to the exploded table as a categorical whose
# categories run Mon..Sun (calendar order rather than alphabetical).
netflix_long["weekday"] = (
    netflix_long["date"]
    .dt.day_name()
    .str.slice(stop=3)
    .astype(pd.CategoricalDtype(["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]))
)

# Rating summary (mean, std, count) for every genre/weekday combination.
genre_mean_by_weekday = (
    netflix_long.groupby(["genre", "weekday"], observed=True)["rating"]
    .agg(["mean", "std", "count"])
    .reset_index()
)
genre_mean_by_weekday

	genre	weekday	mean	std	count
0	Action	Mon	3.555	1.045	85715
1	Action	Tue	3.549	1.043	86670
2	Action	Wed	3.561	1.047	82093
3	Action	Thu	3.556	1.051	70626
4	Action	Fri	3.548	1.059	64562
...	...	...	...	...	...
149	Western	Wed	3.579	1.043	1017
150	Western	Thu	3.621	1.063	931
151	Western	Fri	3.581	1.034	836
152	Western	Sat	3.555	1.015	632
153	Western	Sun	3.602	0.997	738

154 rows × 5 columns

# Genre names ordered from highest to lowest overall mean rating
# (same expression as used for the earlier genre bar charts; recomputed here).
order_by_mean = genre_mean.sort_values("mean", ascending=False)["genre"].values

# Mean rating per genre, with one dodged bar per weekday.
(
    so.Plot(genre_mean_by_weekday, y="genre", x="mean", color="weekday")
    .add(so.Bar(), so.Dodge())
    .scale(y=so.Nominal(order=order_by_mean))  # impose the ordering on the plot axis
    # .facet("weekday")
    .limit(x=(3.3, 4.1))
    .layout(size=(9, 9))
)

# Rating summary (mean, std, count) for every title/genre pair in the exploded table.
mean_ratings_by_genre = (
    netflix_long.groupby(["title", "genre"])["rating"]
    .agg(["mean", "std", "count"])
    .reset_index()  # title and genre become regular columns
)
mean_ratings_by_genre

	title	genre	mean	std	count
0	10	Comedy	3.104	0.956	498
1	10	Romance	3.104	0.956	498
2	10 Things I Hate About You	Comedy	3.728	0.992	4705
3	10 Things I Hate About You	Drama	3.728	0.992	4705
4	10 Things I Hate About You	Romance	3.728	0.992	4705
...	...	...	...	...	...
2199	Yellow Submarine	Animation	3.575	1.105	784
2200	Yellow Submarine	Comedy	3.575	1.105	784
2201	Youngblood	Drama	3.256	1.029	328
2202	Youngblood	Romance	3.256	1.029	328
2203	Youngblood	Sport	3.256	1.029	328

2204 rows × 5 columns

# Per-title mean rating (x) against standard deviation (y), one panel per genre;
# dot size is mapped to the number of ratings.
(
    so.Plot(mean_ratings_by_genre, x="mean", y="std")
    .add(so.Dots(alpha=.5), pointsize="count")
    .add(so.Line(color=".3"), so.PolyFit(1))  # linear (degree-1) fit within each panel
    .facet("genre", wrap=4)  # four panels per row
    .share(y=False)  # each panel gets its own y-axis range
    .scale(pointsize=(3, 20))
    .layout(size=(9, 13))
)

출시년도

netflix.head(3)

	movie_id	user_id	rating	date	title	genre	year
0	3282	972104	4	2005-09-16	Sideways	[Comedy, Drama, Romance]	2004
1	143	2297762	5	2004-08-07	The Game	[Drama, Mystery, Thriller]	1997
2	1744	1489846	3	2003-05-22	Beverly Hills Cop	[Action, Comedy, Crime]	1984

# Mean rating and number of ratings for each (year, title) pair.
netflix.groupby(["year", "title"])["rating"].agg(["mean", "count"])

		mean	count
year	title
1916	20,000 Leagues Under the Sea	3.704	162
1918	Chaplin	3.000	47
1922	Robin Hood	3.080	75
1927	It	4.067	15
1927	The Little Rascals	3.789	114
...	...	...	...
2005	The Amityville Horror	3.502	1947
	The Ballad of Jack and Rose	2.971	313
	The Hitchhiker's Guide to the Galaxy	2.997	2949
	The Pacifier	3.580	3966
	Unleashed	3.733	845

844 rows × 2 columns

# Mean rating for each value of `year`, drawn as a line with point markers.
(
    so.Plot(netflix, x="year", y="rating")
    .add(so.Line(marker="."), so.Agg("mean"))
)

# By decade (10-year bins)
# NOTE: this adds a `decade` column to `netflix` itself, not to a copy.
netflix["decade"] = netflix["year"] // 10 * 10
# Mean rating per decade, drawn as a line with point markers.
(
    so.Plot(netflix, x="decade", y="rating")
    .add(so.Line(marker="."), so.Agg("mean"))
)