Project: Netflix Prize Data

Load packages

# numerical calculation & data frames
import numpy as np
import pandas as pd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
import seaborn.objects as so

# pandas display / behaviour options
pd.set_option('mode.copy_on_write', True)  # opt in to copy-on-write semantics (pandas 2.x)
pd.set_option('display.float_format', '{:.3f}'.format)  # undo with pd.reset_option('display.float_format')
pd.set_option('display.max_rows', 7)  # truncate long frames to at most 7 displayed rows

# NumPy print options: two decimals, no scientific notation
np.set_printoptions(precision=2, suppress=True)

# Plot theme: start from matplotlib's "ggplot" style sheet, override the grid
# and axes colours, then register the result as the seaborn.objects theme.
from matplotlib import style

theme_dict = dict(style.library['ggplot'])
theme_dict.update({
    "grid.linestyle": ":",
    'axes.facecolor': 'white',
    'grid.color': '.6',
})
so.Plot.config.theme.update(theme_dict)

# theme_dict = {**sns.axes_style("whitegrid"), "grid.linestyle": ":"}
# so.Plot.config.theme.update(theme_dict)

# For high resolution display: ask the inline backend to render figures
# in the "retina" format.
import matplotlib_inline
matplotlib_inline.backend_inline.set_matplotlib_formats("retina")

from icecream import ic

Project: Netflix Prize Data

Dataset 링크

Rating File: combined_data_1.txt, …

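Each file lists a movie ID followed by a colon on its own line; every following line is one rating for that movie in the format CustomerID,Rating,Date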
MovieIDs range from 1 to 17770 sequentially.
CustomerIDs range from 1 to 2649429, with gaps. There are 480189 users.
Ratings are on a five star (integral) scale from 1 to 5.
Dates have the format YYYY-MM-DD.

Movie Titles: movie_titles.csv

MovieID,YearOfRelease,Title

데이터 다운로드 및 가져오기

import kagglehub

# Download latest version
# `path` holds the value returned by kagglehub; it is printed here and
# passed to shutil.move in the next cell.
path = kagglehub.dataset_download("netflix-inc/netflix-prize-data")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/netflix-inc/netflix-prize-data?dataset_version_number=2...

100%|██████████| 683M/683M [00:31<00:00, 22.5MB/s]

Extracting files...

Path to dataset files: /home/codespace/.cache/kagglehub/datasets/netflix-inc/netflix-prize-data/versions/2

import shutil
from pathlib import Path

# Create a data/ folder in the current directory.
data_dir = Path("data")
data_dir.mkdir(exist_ok=True)

# Move the downloaded files into data/.
# Guard against re-runs: when the destination directory already exists,
# shutil.move places the source *inside* it (data/netflix-prize-data/<name>)
# instead of replacing it, which would silently create a nested copy.
target_dir = data_dir / "netflix-prize-data"
if not target_dir.exists():
    shutil.move(path, target_dir)

# Echo the destination as the cell result.
target_dir.as_posix()

'data/netflix-prize-data'

Movie Titles 데이터

내장 open()으로 읽기

# Print the first five lines of the raw file.
# The encoding is given explicitly (latin-1, as in the other cells that read
# this file) so the result does not depend on the platform's default locale
# encoding.
with open("data/netflix-prize-data/movie_titles.csv", encoding="latin-1") as f:
    for _ in range(5):
        print(f.readline(), end='')

1,2003,Dinosaur Planet
2,2004,Isle of Man TT 2004 Review
3,1997,Character
4,1994,Paula Abdul's Get Up & Dance
5,2004,The Rise and Fall of ECW

# Show the lines at zero-based positions 69-71 (movie ids 70-72); the last of
# them has a comma inside its title. Stop reading shortly after that window.
with open("data/netflix-prize-data/movie_titles.csv", encoding="latin-1") as f:
    for i, line in enumerate(f):
        if i > 72:
            break
        if 69 <= i <= 71:
            print(line, end='')

70,1999,Tai Chi: The 24 Forms
71,1995,Maya Lin: A Strong Clear Vision
72,1974,At Home Among Strangers, A Stranger Among His Own

csv 모듈로 읽기

import csv

# newline='' lets the csv module do its own line-ending handling (as the csv
# documentation recommends), and the explicit latin-1 encoding matches the
# other cells that read this file instead of relying on the locale default.
with open("data/netflix-prize-data/movie_titles.csv", encoding="latin-1", newline="") as f:
    reader = csv.reader(f)
    # Each row comes back as a list of strings; show the first five.
    for _ in range(5):
        print(next(reader))

['1', '2003', 'Dinosaur Planet']
['2', '2004', 'Isle of Man TT 2004 Review']
['3', '1997', 'Character']
['4', '1994', "Paula Abdul's Get Up & Dance"]
['5', '2004', 'The Rise and Fall of ECW']

from csv import DictReader

# Read every row as a dict keyed by the three supplied field names.
# newline='' is what the csv module expects for files it parses.
# NOTE: a title that itself contains commas yields more than three fields;
# DictReader keeps only the first piece under 'title' and stores the surplus
# pieces under the None key (its default restkey).
with open("data/netflix-prize-data/movie_titles.csv", encoding='latin-1', newline='') as f:
    results = list(DictReader(f, fieldnames=['movie_id', 'year', 'title']))
results[:5]

[{'movie_id': '1', 'year': '2003', 'title': 'Dinosaur Planet'},
 {'movie_id': '2', 'year': '2004', 'title': 'Isle of Man TT 2004 Review'},
 {'movie_id': '3', 'year': '1997', 'title': 'Character'},
 {'movie_id': '4', 'year': '1994', 'title': "Paula Abdul's Get Up & Dance"},
 {'movie_id': '5', 'year': '2004', 'title': 'The Rise and Fall of ECW'}]

pandas로 읽기

# Fails: this call is kept on purpose to show the problem.
# Only three column names are supplied, but some titles contain commas, so the
# parser meets a row with four fields and raises ParserError (see the traceback
# below: "Expected 3 fields in line 72, saw 4").
movie_titles = pd.read_csv(
    "data/netflix-prize-data/movie_titles.csv", 
    encoding="latin-1",
    header=None,
    names=['movie_id', 'year', 'title'],
)

---------------------------------------------------------------------------
ParserError                               Traceback (most recent call last)
Cell In[10], line 2
      1 # Fails
----> 2 movie_titles = pd.read_csv(
      3     "data/netflix-prize-data/movie_titles.csv", 
      4     encoding="latin-1",
      5     header=None,
      6     names=['movie_id', 'year', 'title'],
      7 )

File /workspaces/codespaces-jupyter/.venv/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1026, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)
   1013 kwds_defaults = _refine_defaults_read(
   1014     dialect,
   1015     delimiter,
   (...)   1022     dtype_backend=dtype_backend,
   1023 )
   1024 kwds.update(kwds_defaults)
-> 1026 return _read(filepath_or_buffer, kwds)

File /workspaces/codespaces-jupyter/.venv/lib/python3.12/site-packages/pandas/io/parsers/readers.py:626, in _read(filepath_or_buffer, kwds)
    623     return parser
    625 with parser:
--> 626     return parser.read(nrows)

File /workspaces/codespaces-jupyter/.venv/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1923, in TextFileReader.read(self, nrows)
   1916 nrows = validate_integer("nrows", nrows)
   1917 try:
   1918     # error: "ParserBase" has no attribute "read"
   1919     (
   1920         index,
   1921         columns,
   1922         col_dict,
-> 1923     ) = self._engine.read(  # type: ignore[attr-defined]
   1924         nrows
   1925     )
   1926 except Exception:
   1927     self.close()

File /workspaces/codespaces-jupyter/.venv/lib/python3.12/site-packages/pandas/io/parsers/c_parser_wrapper.py:234, in CParserWrapper.read(self, nrows)
    232 try:
    233     if self.low_memory:
--> 234         chunks = self._reader.read_low_memory(nrows)
    235         # destructive to chunks
    236         data = _concatenate_chunks(chunks)

File pandas/_libs/parsers.pyx:838, in pandas._libs.parsers.TextReader.read_low_memory()

File pandas/_libs/parsers.pyx:905, in pandas._libs.parsers.TextReader._read_rows()

File pandas/_libs/parsers.pyx:874, in pandas._libs.parsers.TextReader._tokenize_rows()

File pandas/_libs/parsers.pyx:891, in pandas._libs.parsers.TextReader._check_tokenize_status()

File pandas/_libs/parsers.pyx:2061, in pandas._libs.parsers.raise_parser_error()

ParserError: Error tokenizing data. C error: Expected 3 fields in line 72, saw 4

# Titles may contain commas, so each line is parsed by hand.
data = []
with open("data/netflix-prize-data/movie_titles.csv", encoding="latin-1") as f:
    for line in f:
        # Split on the first two commas only -> at most (movie_id, year, title).
        fields = line.strip().split(',', maxsplit=2)

        # Anything that does not yield exactly three fields is reported and skipped.
        if len(fields) != 3:
            print(f"Warning: Unexpected line format: {line.strip()}")
            continue

        movie_id, year, title = fields
        try:
            row = [int(movie_id), int(year), title]
        except ValueError:
            # Non-integer id/year (e.g. a NULL year): log the raw line, keep no row.
            ic(line)
        else:
            data.append(row)

movie_titles = pd.DataFrame(data, columns=['movie_id', 'year', 'title'])

ic| line: '''4388,NULL,Ancient Civilizations: Rome and Pompeii

           '''

ic| line: '''4794,NULL,Ancient Civilizations: Land of the Pharaohs

           '''

ic| line: '''7241,NULL,Ancient Civilizations: Athens and Greece

           '''

ic| line: '''10782,NULL,Roti Kapada Aur Makaan

           '''

ic| line: '''15918,NULL,Hote Hote Pyaar Ho Gaya

           '''

ic| line: '''16678,NULL,Jimmy Hollywood

           '''

ic| line: '''17667,NULL,Eros Dance Dhamaka

           '''

# Display the parsed table; lines whose id/year could not be converted to int
# were logged by ic and are not part of it.
movie_titles

	movie_id	year	title
0	1	2003	Dinosaur Planet
1	2	2004	Isle of Man TT 2004 Review
2	3	1997	Character
...	...	...	...
17760	17768	2000	Epoch
17761	17769	2003	The Company
17762	17770	2003	Alien Hunter

17763 rows × 3 columns

## make a directory for cleaned data
import os
# exist_ok=True: do not fail when the folder is already there
os.makedirs("data/netflix-prize-data-cleaned", exist_ok=True)

## export movie_titles
# csv (index=False: do not write the row index as a column)
movie_titles.to_csv("data/netflix-prize-data-cleaned/movie_titles.csv", index=False)

# json (orient="records": a list with one object per row)
movie_titles.to_json("data/netflix-prize-data-cleaned/movie_titles.json", orient="records")

# parquet
movie_titles.to_parquet("data/netflix-prize-data-cleaned/movie_titles.parquet")

모듈을 직접 이용해 export

json

import json

# Convert the DataFrame to a list of dicts in "records" form
data = movie_titles.to_dict(orient='records')

# Write it with the json module (this rewrites the movie_titles.json produced
# by DataFrame.to_json in the earlier export cell)
with open("data/netflix-prize-data-cleaned/movie_titles.json", 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False)  # ensure_ascii=False: keep non-ASCII characters (e.g. Korean) as-is instead of escaping them

parquet

import pyarrow as pa
import pyarrow.parquet as pq

# Convert the DataFrame to a PyArrow Table
table = pa.Table.from_pandas(movie_titles)

# Save it as a Parquet file (same path as the earlier DataFrame.to_parquet
# export, so that file is rewritten)
pq.write_table(table, "data/netflix-prize-data-cleaned/movie_titles.parquet")

# deep=True makes pandas measure the string objects held by the object-dtype
# `title` column; without it only the per-row references are counted and the
# reported size is too small.
print(f"movie_titles 데이터의 메모리 사이즈: {movie_titles.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

movie_titles 데이터의 메모리 사이즈: 0.41 MB

Ratings 데이터

combined_data_1.txt

# Preview the first ten lines of the raw ratings file.
with open("data/netflix-prize-data/combined_data_1.txt") as f:
    head_lines = [f.readline() for _ in range(10)]
print(''.join(head_lines), end='')

1:
1488844,3,2005-09-06
822109,5,2005-05-13
885013,4,2005-10-19
30878,4,2005-12-26
823519,3,2004-05-03
893988,3,2005-11-17
124105,4,2004-08-05
1248029,3,2004-04-22
1842128,4,2004-05-09

# Parse the ratings file into a dictionary:
#   {movie_id: [(user_id, rating, date), ...]}

ratings_dict = {}
current_movie_id = None

with open("data/netflix-prize-data/combined_data_1.txt") as f:
    for raw_line in f:
        entry = raw_line.strip()

        # A line ending in ':' is a movie header such as "1:" -> start a new list.
        if entry.endswith(':'):
            current_movie_id = int(entry[:-1])
            ratings_dict[current_movie_id] = []
            continue

        # Every other line is expected to be "user_id,rating,date"; lines that
        # do not split into exactly three fields are ignored.
        fields = entry.split(',')
        if len(fields) != 3:
            continue
        user_id, rating, date = fields
        ratings_dict[current_movie_id].append((int(user_id), int(rating), date))

# Sanity check: rating count and first three ratings for the first three movies.
for movie_id, movie_ratings in list(ratings_dict.items())[:3]:
    print(f"movie_id {movie_id}: {len(movie_ratings)} ratings")
    print(f"  처음 3개: {movie_ratings[:3]}")

movie_id 1: 547 ratings
  처음 3개: [(1488844, 3, '2005-09-06'), (822109, 5, '2005-05-13'), (885013, 4, '2005-10-19')]
movie_id 2: 145 ratings
  처음 3개: [(2059652, 4, '2005-09-05'), (1666394, 3, '2005-04-19'), (1759415, 4, '2005-04-22')]
movie_id 3: 2012 ratings
  처음 3개: [(1025579, 4, '2003-03-29'), (712664, 5, '2004-02-01'), (1331154, 4, '2004-07-03')]

# pip install Pympler
from pympler import asizeof

# Measure the in-memory size of the whole dictionary with pympler's asizeof
# and report it in MB (bytes / 1024**2).
dict_memory = asizeof.asizeof(ratings_dict)
print(f"ratings_dict 데이터의 메모리 사이즈: {dict_memory / 1024**2:.2f} MB")

ratings_dict 데이터의 메모리 사이즈: 3682.36 MB

import pickle

# Serialise the parsed ratings dictionary to a binary pickle file.
# NOTE: pickle files should only ever be loaded back from trusted sources.
with open("data/netflix-prize-data-cleaned/ratings_dict.pkl", "wb") as f:
    pickle.dump(ratings_dict, f)

import json

# Serialise the same dictionary as JSON text.
# NOTE(review): JSON object keys are strings and JSON has no tuple type, so the
# integer movie_id keys and the (user_id, rating, date) tuples will not come
# back with the same Python types when this file is loaded — confirm that is
# acceptable for consumers.
with open("data/netflix-prize-data-cleaned/ratings_dict.json", "w") as f:
    json.dump(ratings_dict, f)

# Compare the on-disk size of the pickle and JSON dumps.
import os

pkl_path = "data/netflix-prize-data-cleaned/ratings_dict.pkl"
json_path = "data/netflix-prize-data-cleaned/ratings_dict.json"

file_size_pkl = os.path.getsize(pkl_path)
file_size_json = os.path.getsize(json_path)

# Sizes are reported in MB (bytes / 1024**2).
bytes_per_mb = 1024**2
print(f"File size of ratings_dict.pkl: {file_size_pkl / bytes_per_mb:.2f} MB")
print(f"File size of ratings_dict.json: {file_size_json / bytes_per_mb:.2f} MB")

File size of ratings_dict.pkl: 503.69 MB
File size of ratings_dict.json: 632.69 MB

pandas의 dataframe으로 변환

# Take the first three movies and turn the ratings of movie 1 into a frame.
first_three_ids = list(ratings_dict)[:3]
dic1 = {movie: ratings_dict[movie] for movie in first_three_ids}

ratings_df1 = pd.DataFrame(dic1[1], columns=['user_id', 'rating', 'date'])
ratings_df1

	user_id	rating	date
0	1488844	3	2005-09-06
1	822109	5	2005-05-13
2	885013	4	2005-10-19
...	...	...	...
544	1535440	4	2005-08-18
545	1426604	4	2005-09-01
546	1815755	5	2004-07-20

547 rows × 3 columns

# Build one DataFrame per movie, tag it with its movie_id, then stack them.
frames = []
for key, rows in ratings_dict.items():
    _df = pd.DataFrame(rows, columns=['user_id', 'rating', 'date'])
    _df['movie_id'] = key
    frames.append(_df)

# ignore_index=True gives the combined table a fresh 0..n-1 index. Without it
# every per-movie frame contributes its own 0..k index, leaving a duplicated
# integer index that costs memory and is written out with the data.
ratings_df = pd.concat(frames, ignore_index=True)

ratings_df

	user_id	rating	date	movie_id
0	1488844	3	2005-09-06	1
1	822109	5	2005-05-13	1
2	885013	4	2005-10-19	1
...	...	...	...	...
425	512536	5	2005-07-27	4499
426	988963	3	2005-12-20	4499
427	1704416	3	2004-06-02	4499

24053764 rows × 4 columns

# Inspect dtypes and memory usage before any type conversion.
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24053764 entries, 0 to 427
Data columns (total 4 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   user_id   int64 
 1   rating    int64 
 2   date      object
 3   movie_id  int64 
dtypes: int64(3), object(1)
memory usage: 917.6+ MB

각 컬럼의 타입 변형

# Shrink the integer columns to nullable types of the chosen widths.
ratings_df = ratings_df.astype({
    'user_id': 'Int32',
    'rating': 'Int8',
    'movie_id': 'Int16',
})

# Parse the date strings into datetimes.
ratings_df['date'] = pd.to_datetime(ratings_df['date'])

# Put movie_id first.
column_order = ['movie_id', 'user_id', 'rating', 'date']
ratings_df = ratings_df[column_order]

ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24053764 entries, 0 to 427
Data columns (total 4 columns):
 #   Column    Dtype         
---  ------    -----         
 0   movie_id  Int16         
 1   user_id   Int32         
 2   rating    Int8          
 3   date      datetime64[ns]
dtypes: Int16(1), Int32(1), Int8(1), datetime64[ns](1)
memory usage: 596.4 MB

parquet로 저장하기

ratings를 parquet 파일로 저장
pyarrow 필요: pip install pyarrow

# Write the full ratings table to Parquet. One call is sufficient; issuing the
# identical call twice only rewrites the same file.
ratings_df.to_parquet("data/netflix-prize-data-cleaned/ratings_df.parquet")

# check file size
import os

# Size on disk of the Parquet export, reported in MB (bytes / 1024**2).
file_size_parquet = os.path.getsize("data/netflix-prize-data-cleaned/ratings_df.parquet")
print(f"File size of ratings_df.parquet: {file_size_parquet / 1024**2:.2f} MB")

File size of ratings_df.parquet: 200.46 MB

# to json (orient="records": a list with one object per row)
# NOTE(review): with the default date_format pandas is expected to write the
# datetime column as epoch values rather than ISO strings — confirm that is
# the intended representation (the final export in this notebook casts its
# date column to str before calling to_json).
ratings_df.to_json("data/netflix-prize-data-cleaned/ratings_df.json", orient="records")

# Size on disk of the JSON export, reported in MB (bytes / 1024**2).
file_size_df_json = os.path.getsize("data/netflix-prize-data-cleaned/ratings_df.json")
print(f"ratings_df.json 파일의 크기: {file_size_df_json / 1024**2:.2f} MB")

ratings_df.json 파일의 크기: 1544.69 MB

50%만 선택

# Draw a random half of the rows; random_state fixes the seed so the same
# sample is drawn on every run.
ratings_sample = ratings_df.sample(frac=0.5, random_state=123)

ratings_sample.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12026882 entries, 250 to 5350
Data columns (total 4 columns):
 #   Column    Dtype         
---  ------    -----         
 0   movie_id  Int16         
 1   user_id   Int32         
 2   rating    Int8          
 3   date      datetime64[ns]
dtypes: Int16(1), Int32(1), Int8(1), datetime64[ns](1)
memory usage: 298.2 MB

# Histogram of ratings-per-movie in the full data, limited to movies with
# fewer than 2000 ratings.
counts_full = (
    ratings_df.groupby('movie_id').size()
    .sort_values(ascending=False)
    .reset_index(name='count')
)
below_cutoff_full = counts_full.query('count < 2000')
below_cutoff_full.loc[:, 'count'].hist(bins=100);

# Same histogram for the 50% sample, with the cut-off at 1000 ratings.
counts_sample = (
    ratings_sample.groupby('movie_id').size()
    .sort_values(ascending=False)
    .reset_index(name='count')
)
below_cutoff_sample = counts_sample.query('count < 1000')
below_cutoff_sample.loc[:, 'count'].hist(bins=100);

# Persist the 50% sample; this is the file the "cleaned data" section reads back.
ratings_sample.to_parquet("data/netflix-prize-data-cleaned/ratings_sample.parquet")
# Display the full (unsampled) table.
ratings_df

	movie_id	user_id	rating	date
0	1	1488844	3	2005-09-06
1	1	822109	5	2005-05-13
2	1	885013	4	2005-10-19
...	...	...	...	...
425	4499	512536	5	2005-07-27
426	4499	988963	3	2005-12-20
427	4499	1704416	3	2004-06-02

24053764 rows × 4 columns

Movie info 데이터

데이터셋 링크

import kagglehub
import shutil
from pathlib import Path

path = kagglehub.dataset_download("shivamb/netflix-shows")

# Move the download into the data folder.
# Guard against re-runs: when the destination directory already exists,
# shutil.move places the source *inside* it instead of replacing it, which
# would silently create a nested copy.
target_dir = Path("data") / "netflix-shows"
target_dir.parent.mkdir(parents=True, exist_ok=True)
if not target_dir.exists():
    shutil.move(path, target_dir)

# Echo the destination as the cell result.
target_dir.as_posix()

Downloading from https://www.kaggle.com/api/v1/datasets/download/shivamb/netflix-shows?dataset_version_number=5...

100%|██████████| 1.34M/1.34M [00:01<00:00, 1.32MB/s]

Extracting files...

'data/netflix-shows'

# Load the Netflix shows metadata and display it.
netflix_info = pd.read_csv("data/netflix-shows/netflix_titles.csv")
netflix_info

	show_id	type	title	director	cast	country	date_added	release_year	rating	duration	listed_in	description
0	s1	Movie	Dick Johnson Is Dead	Kirsten Johnson	NaN	United States	September 25, 2021	2020	PG-13	90 min	Documentaries	As her father nears the end of his life, filmm...
1	s2	TV Show	Blood & Water	NaN	Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...	South Africa	September 24, 2021	2021	TV-MA	2 Seasons	International TV Shows, TV Dramas, TV Mysteries	After crossing paths at a party, a Cape Town t...
2	s3	TV Show	Ganglands	Julien Leclercq	Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...	NaN	September 24, 2021	2021	TV-MA	1 Season	Crime TV Shows, International TV Shows, TV Act...	To protect his family from a powerful drug lor...
...	...	...	...	...	...	...	...	...	...	...	...	...
8804	s8805	Movie	Zombieland	Ruben Fleischer	Jesse Eisenberg, Woody Harrelson, Emma Stone, ...	United States	November 1, 2019	2009	R	88 min	Comedies, Horror Movies	Looking to survive in a world taken over by zo...
8805	s8806	Movie	Zoom	Peter Hewitt	Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...	United States	January 11, 2020	2006	PG	88 min	Children & Family Movies, Comedies	Dragged from civilian life, a former superhero...
8806	s8807	Movie	Zubaan	Mozez Singh	Vicky Kaushal, Sarah-Jane Dias, Raaghav Chanan...	India	March 2, 2019	2015	TV-14	111 min	Dramas, International Movies, Music & Musicals	A scrappy but poor boy worms his way into a ty...

8807 rows × 12 columns

# Column dtypes and non-null counts of the raw metadata.
netflix_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB

정리한 데이터 가져오기

# Reload the inputs for the merge steps: the cleaned movie titles, the 50%
# ratings sample (bound to `ratings`), and the raw Netflix shows metadata.
movie_titles = pd.read_csv("data/netflix-prize-data-cleaned/movie_titles.csv")
ratings = pd.read_parquet("data/netflix-prize-data-cleaned/ratings_sample.parquet")
netflix_info = pd.read_csv("data/netflix-shows/netflix_titles.csv")

# Check the dtypes of the reloaded movie titles.
movie_titles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17763 entries, 0 to 17762
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  17763 non-null  int64 
 1   year      17763 non-null  int64 
 2   title     17763 non-null  object
dtypes: int64(2), object(1)
memory usage: 416.4+ KB

# Check the dtypes of the reloaded ratings sample.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12026882 entries, 250 to 5350
Data columns (total 4 columns):
 #   Column    Dtype         
---  ------    -----         
 0   movie_id  Int16         
 1   user_id   Int32         
 2   rating    Int8          
 3   date      datetime64[ns]
dtypes: Int16(1), Int32(1), Int8(1), datetime64[ns](1)
memory usage: 298.2 MB

# Check the dtypes of the reloaded Netflix metadata.
netflix_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB

# Rename release_year -> year and rating -> MPAA_rating.
# (Presumably so `year` matches the movie_titles column and `rating` does not
# collide with the user rating column in the later merges.)
netflix_info = netflix_info.rename(
    columns={'release_year': 'year', 'rating': 'MPAA_rating'}
)

# Store year as a nullable 16-bit integer.
netflix_info = netflix_info.astype({'year': 'Int16'})

duration 컬럼 정리

# Rows for which the "ends with 'min'" test yields a missing value.
no_duration = netflix_info['duration'].str.endswith('min').isna()
netflix_info.loc[no_duration, :]

	show_id	type	title	director	cast	country	date_added	year	MPAA_rating	duration	listed_in	description
5541	s5542	Movie	Louis C.K. 2017	Louis C.K.	Louis C.K.	United States	April 4, 2017	2017	74 min	NaN	Movies	Louis C.K. muses on religion, eternal love, gi...
5794	s5795	Movie	Louis C.K.: Hilarious	Louis C.K.	Louis C.K.	United States	September 16, 2016	2010	84 min	NaN	Movies	Emmy-winning comedy writer Louis C.K. brings h...
5813	s5814	Movie	Louis C.K.: Live at the Comedy Store	Louis C.K.	Louis C.K.	United States	August 15, 2016	2015	66 min	NaN	Movies	The comic puts his trademark hilarious/thought...

# Mask of the rows where the "ends with 'min'" test yields a missing value.
idx = netflix_info['duration'].str.endswith('min').isna()

# In those rows the duration text (e.g. "74 min") sits in MPAA_rating, as the
# display above shows: copy it into duration first, then blank MPAA_rating.
# The order matters — the copy must happen before the value is cleared.
netflix_info.loc[idx, 'duration'] = netflix_info.loc[idx, 'MPAA_rating']
netflix_info.loc[idx, 'MPAA_rating'] = pd.NA

# Show the repaired rows.
netflix_info.loc[idx, :]

	show_id	type	title	director	cast	country	date_added	year	MPAA_rating	duration	listed_in	description
5541	s5542	Movie	Louis C.K. 2017	Louis C.K.	Louis C.K.	United States	April 4, 2017	2017	<NA>	74 min	Movies	Louis C.K. muses on religion, eternal love, gi...
5794	s5795	Movie	Louis C.K.: Hilarious	Louis C.K.	Louis C.K.	United States	September 16, 2016	2010	<NA>	84 min	Movies	Emmy-winning comedy writer Louis C.K. brings h...
5813	s5814	Movie	Louis C.K.: Live at the Comedy Store	Louis C.K.	Louis C.K.	United States	August 15, 2016	2015	<NA>	66 min	Movies	The comic puts his trademark hilarious/thought...

# Display the whole table after the duration repair.
netflix_info

	show_id	type	title	director	cast	country	date_added	year	MPAA_rating	duration	listed_in	description
0	s1	Movie	Dick Johnson Is Dead	Kirsten Johnson	NaN	United States	September 25, 2021	2020	PG-13	90 min	Documentaries	As her father nears the end of his life, filmm...
1	s2	TV Show	Blood & Water	NaN	Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...	South Africa	September 24, 2021	2021	TV-MA	2 Seasons	International TV Shows, TV Dramas, TV Mysteries	After crossing paths at a party, a Cape Town t...
2	s3	TV Show	Ganglands	Julien Leclercq	Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...	NaN	September 24, 2021	2021	TV-MA	1 Season	Crime TV Shows, International TV Shows, TV Act...	To protect his family from a powerful drug lor...
...	...	...	...	...	...	...	...	...	...	...	...	...
8804	s8805	Movie	Zombieland	Ruben Fleischer	Jesse Eisenberg, Woody Harrelson, Emma Stone, ...	United States	November 1, 2019	2009	R	88 min	Comedies, Horror Movies	Looking to survive in a world taken over by zo...
8805	s8806	Movie	Zoom	Peter Hewitt	Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...	United States	January 11, 2020	2006	PG	88 min	Children & Family Movies, Comedies	Dragged from civilian life, a former superhero...
8806	s8807	Movie	Zubaan	Mozez Singh	Vicky Kaushal, Sarah-Jane Dias, Raaghav Chanan...	India	March 2, 2019	2015	TV-14	111 min	Dramas, International Movies, Music & Musicals	A scrappy but poor boy worms his way into a ty...

8807 rows × 12 columns

# Flag the rows whose duration ends with "min"
idx = netflix_info['duration'].str.endswith('min')

# Rows ending in "min": drop " min", convert to integer and store the result in a duration_min column (other rows are left unset)
netflix_info.loc[idx, 'duration_min'] = netflix_info.loc[idx, 'duration'].str.replace(' min', '').astype('Int32')

# Rows not ending in "min": extract the digits and store them in a season column (other rows are left unset)
# NOTE(review): both columns are assigned again, as whole columns, by the
# Series.where-based version that follows.
netflix_info.loc[~idx, 'season'] = netflix_info.loc[~idx, 'duration'].str.extract(r'(\d+)')[0].astype('Int32')

# True where the duration text ends with "min", False otherwise.
idx = netflix_info['duration'].str.endswith('min')

# duration_min: keep only the "min" rows, remove the "min" text, convert to a
# nullable integer; every other row becomes NA.
minutes_text = netflix_info['duration'].where(idx).str.replace('min', '', regex=False)
netflix_info['duration_min'] = minutes_text.astype('Int32')

# season: keep only the non-"min" rows, pull out the digits, convert to a
# nullable integer; every other row becomes NA.
season_text = netflix_info['duration'].where(~idx).str.extract(r'(\d+)', expand=False)
netflix_info['season'] = season_text.astype('Int32')

netflix_info

	show_id	type	title	director	cast	country	date_added	year	MPAA_rating	duration	listed_in	description	duration_min	season
0	s1	Movie	Dick Johnson Is Dead	Kirsten Johnson	NaN	United States	September 25, 2021	2020	PG-13	90 min	Documentaries	As her father nears the end of his life, filmm...	90	<NA>
1	s2	TV Show	Blood & Water	NaN	Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...	South Africa	September 24, 2021	2021	TV-MA	2 Seasons	International TV Shows, TV Dramas, TV Mysteries	After crossing paths at a party, a Cape Town t...	<NA>	2
2	s3	TV Show	Ganglands	Julien Leclercq	Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...	NaN	September 24, 2021	2021	TV-MA	1 Season	Crime TV Shows, International TV Shows, TV Act...	To protect his family from a powerful drug lor...	<NA>	1
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
8804	s8805	Movie	Zombieland	Ruben Fleischer	Jesse Eisenberg, Woody Harrelson, Emma Stone, ...	United States	November 1, 2019	2009	R	88 min	Comedies, Horror Movies	Looking to survive in a world taken over by zo...	88	<NA>
8805	s8806	Movie	Zoom	Peter Hewitt	Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...	United States	January 11, 2020	2006	PG	88 min	Children & Family Movies, Comedies	Dragged from civilian life, a former superhero...	88	<NA>
8806	s8807	Movie	Zubaan	Mozez Singh	Vicky Kaushal, Sarah-Jane Dias, Raaghav Chanan...	India	March 2, 2019	2015	TV-14	111 min	Dramas, International Movies, Music & Musicals	A scrappy but poor boy worms his way into a ty...	111	<NA>

8807 rows × 14 columns

# Inner-join the prize titles with the Netflix metadata on the two columns the
# frames share (year and title), spelled out instead of left implicit.
netflix_info_titles = pd.merge(
    movie_titles,
    netflix_info,
    how='inner',
    on=['year', 'title'],
)
netflix_info_titles

	movie_id	year	title	show_id	type	director	cast	country	date_added	MPAA_rating	duration	listed_in	description	duration_min	season
0	30	2003	Something's Gotta Give	s8056	Movie	Nancy Meyers	Jack Nicholson, Diane Keaton, Keanu Reeves, Fr...	United States	August 1, 2019	PG-13	128 min	Comedies, Romantic Movies	Still sexy at 60, Harry Sanborn wines and dine...	128	<NA>
1	58	1996	Dragonheart	s6642	Movie	Rob Cohen	Sean Connery, Dennis Quaid, David Thewlis, Pet...	United States	January 1, 2020	PG-13	103 min	Action & Adventure, Sci-Fi & Fantasy	In ancient times when majestic fire-breathers ...	103	<NA>
2	77	1995	Congo	s568	Movie	Frank Marshall	Dylan Walsh, Laura Linney, Ernie Hudson, Tim C...	United States	July 1, 2021	PG-13	108 min	Action & Adventure, Thrillers	Eight people, some with ulterior motives, go o...	108	<NA>
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
406	17611	2002	The Legend of Bhagat Singh	s1867	Movie	Rajkumar Santoshi	Ajay Devgn, Sushant Singh, D. Santosh, Akhilen...	India	October 12, 2020	TV-14	156 min	Dramas, International Movies	This biopic chronicles the life and times of i...	156	<NA>
407	17621	1997	Tomorrow Never Dies	s8604	Movie	Roger Spottiswoode	Pierce Brosnan, Jonathan Pryce, Michelle Yeoh,...	United Kingdom, United States	December 31, 2019	PG-13	119 min	Action & Adventure	Pierce Brosnan stars in this 007 installment, ...	119	<NA>
408	17697	2004	New York Minute	s7580	Movie	Dennie Gordon	Mary-Kate Olsen, Ashley Olsen, Eugene Levy, An...	United States	January 1, 2020	PG	91 min	Children & Family Movies, Comedies	When bickering teenage twins accidentally land...	91	<NA>

409 rows × 15 columns

# Attach the matched title metadata to each rating; movie_id is the only
# column the two frames share, so it is named explicitly as the join key.
netflix_ratings_titles = ratings.merge(
    netflix_info_titles,
    how='inner',
    on='movie_id',
)
netflix_ratings_titles

	movie_id	user_id	rating	date	year	title	show_id	type	director	cast	country	date_added	MPAA_rating	duration	listed_in	description	duration_min	season
0	1962	2191540	4	2004-06-23	2004	50 First Dates	s6019	Movie	Peter Segal	Adam Sandler, Drew Barrymore, Rob Schneider, S...	United States	December 1, 2020	PG-13	99 min	Comedies, Romantic Movies	After falling for a pretty art teacher who has...	99	<NA>
1	3427	998702	2	2005-08-26	2002	Men in Black II	s7444	Movie	Barry Sonnenfeld	Tommy Lee Jones, Will Smith, Rip Torn, Lara Fl...	United States	October 1, 2019	PG-13	88 min	Action & Adventure, Comedies, Sci-Fi & Fantasy	Will Smith and Tommy Lee Jones reprise their r...	88	<NA>
2	1962	1914163	4	2005-03-21	2004	50 First Dates	s6019	Movie	Peter Segal	Adam Sandler, Drew Barrymore, Rob Schneider, S...	United States	December 1, 2020	PG-13	99 min	Comedies, Romantic Movies	After falling for a pretty art teacher who has...	99	<NA>
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1246555	3638	1991791	3	2004-01-06	2003	Bad Boys II	s6213	Movie	Michael Bay	Will Smith, Martin Lawrence, Jordi Mollà, Gabr...	United States	October 1, 2019	R	147 min	Action & Adventure, Comedies	In this hyperkinetic sequel, a pair of Miami n...	147	<NA>
1246556	30	925565	4	2005-11-20	2003	Something's Gotta Give	s8056	Movie	Nancy Meyers	Jack Nicholson, Diane Keaton, Keanu Reeves, Fr...	United States	August 1, 2019	PG-13	128 min	Comedies, Romantic Movies	Still sexy at 60, Harry Sanborn wines and dine...	128	<NA>
1246557	1615	811068	5	2004-08-26	1995	The American President	s8188	Movie	Rob Reiner	Michael Douglas, Annette Bening, Martin Sheen,...	United States	January 1, 2021	PG-13	113 min	Comedies, Dramas, Romantic Movies	The widowed president strikes up a romance wit...	113	<NA>

1246558 rows × 18 columns

# Column dtypes and non-null counts of the merged table.
netflix_ratings_titles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1246558 entries, 0 to 1246557
Data columns (total 18 columns):
 #   Column        Non-Null Count    Dtype         
---  ------        --------------    -----         
 0   movie_id      1246558 non-null  Int16         
 1   user_id       1246558 non-null  Int32         
 2   rating        1246558 non-null  Int8          
 3   date          1246558 non-null  datetime64[ns]
 4   year          1246558 non-null  int64         
 5   title         1246558 non-null  object        
 6   show_id       1246558 non-null  object        
 7   type          1246558 non-null  object        
 8   director      1245701 non-null  object        
 9   cast          1246558 non-null  object        
 10  country       1246109 non-null  object        
 11  date_added    1246558 non-null  object        
 12  MPAA_rating   1246558 non-null  object        
 13  duration      1246558 non-null  object        
 14  listed_in     1246558 non-null  object        
 15  description   1246558 non-null  object        
 16  duration_min  1245701 non-null  Int32         
 17  season        857 non-null      Int32         
dtypes: Int16(1), Int32(3), Int8(1), datetime64[ns](1), int64(1), object(11)
memory usage: 147.4+ MB

# Final column names for the exported table.
# NOTE(review): `season` was extracted from texts such as "2 Seasons" yet is
# renamed num_episodes, and `date` (the date column of the ratings file) is
# renamed watch_date — confirm these names describe the data as intended.
column_renames = {
    'year': 'release_year',
    'duration_min': 'runtime',
    'season': 'num_episodes',
    'type': 'media_type',
    'show_id': 'record_id',
    'listed_in': 'genre',
    'date': 'watch_date',
    'user_id': 'viewer_id',
}
netflix_ratings_titles = (
    netflix_ratings_titles
    .rename(columns=column_renames)
    .drop(columns=['movie_id'])  # the prize-data movie id is not kept
)

genre = netflix_ratings_titles['genre']
genre

0                               Comedies, Romantic Movies
1          Action & Adventure, Comedies, Sci-Fi & Fantasy
2                               Comedies, Romantic Movies
                                ...                      
1246555                      Action & Adventure, Comedies
1246556                         Comedies, Romantic Movies
1246557                 Comedies, Dramas, Romantic Movies
Name: genre, Length: 1246558, dtype: object

# Split the comma-separated genre text into one row per genre, strip the
# padding left around each piece, and only then count. Counting before
# stripping would tally " Comedies" and "Comedies" as two different genres.
genre_names = (
    netflix_ratings_titles['genre']
    .str.split(',')
    .explode()
    .str.strip()
    .value_counts()
)

# The distinct genre names.
set(genre_names.index)

{'Action & Adventure',
 'Anime Series',
 'Children & Family Movies',
 'Classic Movies',
 'Comedies',
 'Cult Movies',
 'Documentaries',
 'Dramas',
 'Horror Movies',
 'Independent Movies',
 'International Movies',
 "Kids' TV",
 'Music & Musicals',
 'Romantic Movies',
 'Sci-Fi & Fantasy',
 'Sports Movies',
 'TV Dramas',
 'Thrillers'}

# Turn each comma-separated genre string into a list of names with the
# surrounding whitespace removed from every element.
def _strip_each(items):
    """Return the elements of *items* with leading/trailing whitespace removed."""
    return [item.strip() for item in items]


genre_split = genre.str.split(',').apply(_strip_each)

genre_split

0                               [Comedies, Romantic Movies]
1          [Action & Adventure, Comedies, Sci-Fi & Fantasy]
2                               [Comedies, Romantic Movies]
                                 ...                       
1246555                      [Action & Adventure, Comedies]
1246556                         [Comedies, Romantic Movies]
1246557                 [Comedies, Dramas, Romantic Movies]
Name: genre, Length: 1246558, dtype: object

# Swap the comma-separated genre strings for the per-row lists of labels.
netflix_ratings_titles = netflix_ratings_titles.assign(genre=genre_split)

# Re-check the column names, dtypes and non-null counts after the replacement.
netflix_ratings_titles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1246558 entries, 0 to 1246557
Data columns (total 17 columns):
 #   Column        Non-Null Count    Dtype         
---  ------        --------------    -----         
 0   viewer_id     1246558 non-null  Int32         
 1   rating        1246558 non-null  Int8          
 2   watch_date    1246558 non-null  datetime64[ns]
 3   release_year  1246558 non-null  int64         
 4   title         1246558 non-null  object        
 5   record_id     1246558 non-null  object        
 6   media_type    1246558 non-null  object        
 7   director      1245701 non-null  object        
 8   cast          1246558 non-null  object        
 9   country       1246109 non-null  object        
 10  date_added    1246558 non-null  object        
 11  MPAA_rating   1246558 non-null  object        
 12  duration      1246558 non-null  object        
 13  genre         1246558 non-null  object        
 14  description   1246558 non-null  object        
 15  runtime       1245701 non-null  Int32         
 16  num_episodes  857 non-null      Int32         
dtypes: Int32(3), Int8(1), datetime64[ns](1), int64(1), object(11)
memory usage: 143.8+ MB

# Make sure the export directory exists first: neither to_parquet nor to_json
# creates missing parent directories, so a fresh checkout would fail here.
export_dir = Path("data/netflix-prize-data-cleaned")
export_dir.mkdir(parents=True, exist_ok=True)

# convert to parquet
netflix_ratings_titles.to_parquet(export_dir / "netflix_ratings_titles.parquet")

# convert to json
# watch_date is converted to text on a copy so the JSON records carry readable
# date strings instead of to_json's default datetime encoding, while the
# original frame keeps its datetime64 column.
netflix_ratings_titles2 = netflix_ratings_titles.copy()
netflix_ratings_titles2['watch_date'] = netflix_ratings_titles2['watch_date'].astype(str)
netflix_ratings_titles2.to_json(export_dir / "netflix_ratings_titles.json", orient="records")

Cleaned Data

# Reload the cleaned dataset from its parquet export.
# NOTE(review): list-valued columns such as 'genre' may come back as NumPy
# arrays rather than Python lists after a parquet round trip; confirm before
# relying on list-specific behaviour.
cleaned_parquet_path = "data/netflix-prize-data-cleaned/netflix_ratings_titles.parquet"
netflix_ratings_titles = pd.read_parquet(cleaned_parquet_path)

# Summarise columns, dtypes and non-null counts of the reloaded frame.
netflix_ratings_titles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1246558 entries, 0 to 1246557
Data columns (total 17 columns):
 #   Column        Non-Null Count    Dtype         
---  ------        --------------    -----         
 0   viewer_id     1246558 non-null  Int32         
 1   rating        1246558 non-null  Int8          
 2   watch_date    1246558 non-null  datetime64[ns]
 3   release_year  1246558 non-null  int64         
 4   title         1246558 non-null  object        
 5   record_id     1246558 non-null  object        
 6   media_type    1246558 non-null  object        
 7   director      1245701 non-null  object        
 8   cast          1246558 non-null  object        
 9   country       1246109 non-null  object        
 10  date_added    1246558 non-null  object        
 11  MPAA_rating   1246558 non-null  object        
 12  duration      1246558 non-null  object        
 13  genre         1246558 non-null  object        
 14  description   1246558 non-null  object        
 15  runtime       1245701 non-null  Int32         
 16  num_episodes  857 non-null      Int32         
dtypes: Int32(3), Int8(1), datetime64[ns](1), int64(1), object(11)
memory usage: 143.8+ MB

# Display a preview of the reloaded frame as the cell output.
netflix_ratings_titles

	viewer_id	rating	watch_date	release_year	title	record_id	media_type	director	cast	country	date_added	MPAA_rating	duration	genre	description	runtime	num_episodes
0	2191540	4	2004-06-23	2004	50 First Dates	s6019	Movie	Peter Segal	Adam Sandler, Drew Barrymore, Rob Schneider, S...	United States	December 1, 2020	PG-13	99 min	[Comedies, Romantic Movies]	After falling for a pretty art teacher who has...	99	<NA>
1	998702	2	2005-08-26	2002	Men in Black II	s7444	Movie	Barry Sonnenfeld	Tommy Lee Jones, Will Smith, Rip Torn, Lara Fl...	United States	October 1, 2019	PG-13	88 min	[Action & Adventure, Comedies, Sci-Fi & Fantasy]	Will Smith and Tommy Lee Jones reprise their r...	88	<NA>
2	1914163	4	2005-03-21	2004	50 First Dates	s6019	Movie	Peter Segal	Adam Sandler, Drew Barrymore, Rob Schneider, S...	United States	December 1, 2020	PG-13	99 min	[Comedies, Romantic Movies]	After falling for a pretty art teacher who has...	99	<NA>
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1246555	1991791	3	2004-01-06	2003	Bad Boys II	s6213	Movie	Michael Bay	Will Smith, Martin Lawrence, Jordi Mollà, Gabr...	United States	October 1, 2019	R	147 min	[Action & Adventure, Comedies]	In this hyperkinetic sequel, a pair of Miami n...	147	<NA>
1246556	925565	4	2005-11-20	2003	Something's Gotta Give	s8056	Movie	Nancy Meyers	Jack Nicholson, Diane Keaton, Keanu Reeves, Fr...	United States	August 1, 2019	PG-13	128 min	[Comedies, Romantic Movies]	Still sexy at 60, Harry Sanborn wines and dine...	128	<NA>
1246557	811068	5	2004-08-26	1995	The American President	s8188	Movie	Rob Reiner	Michael Douglas, Annette Bening, Martin Sheen,...	United States	January 1, 2021	PG-13	113 min	[Comedies, Dramas, Romantic Movies]	The widowed president strikes up a romance wit...	113	<NA>

1246558 rows × 17 columns

프로젝트 다운로드

project2-movie-analysis.zip

파일 구조

project2-movie-analysis/
├── movie_analysis.ipynb       # 메인 분석 노트북
├── media_classes.py           # Media, Movie, TVShow, ViewingRecord 클래스
├── analyzer.py                # ViewingAnalyzer 클래스
├── visualization.py           # 시각화 함수들
├── utils.py                   # 유틸리티 함수들
└── data/
    ├── netflix_ratings_titles.json      # 시청 기록 데이터 (JSON 형식)
    ├── netflix_ratings_titles.parquet   # 시청 기록 데이터 (parquet 형식)
    └── analysis_results.json            # 분석 결과 (자동 생성)