Project: MovieLens 32M

Load packages

# numerical calculation & data frames
import numpy as np
import pandas as pd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
import seaborn.objects as so

# statistics
import statsmodels.api as sm

# pandas options
pd.options.mode.copy_on_write = True  # pandas 2.0
pd.set_option("display.float_format", "{:.3f}".format)  # undo with pd.reset_option('display.float_format')
pd.set_option("display.max_rows", 7)  # max number of rows to display
pd.set_option("display.notebook_repr_html", True)  # display html in notebook

# NumPy options
np.set_printoptions(suppress=True, precision=2)  # suppress scientific notation

# matplotlib options
from matplotlib import style

# Start from matplotlib's "ggplot" style sheet, then override the grid and
# background settings before handing the result to seaborn.objects.
theme_dict = {
    **style.library["ggplot"],
    "grid.linestyle": ":",
    "axes.facecolor": "white",
    "grid.color": ".6",
}
so.Plot.config.theme.update(theme_dict)

# Alternative theme (disabled): seaborn "whitegrid" with dotted grid lines
# theme_dict = {**sns.axes_style("whitegrid"), "grid.linestyle": ":"}
# so.Plot.config.theme.update(theme_dict)

# For high resolution display
import matplotlib_inline

matplotlib_inline.backend_inline.set_matplotlib_formats("retina")

32 million movie ratings
2 million tags applied to 87,585 movies by 200,948 users.
Collected 10/2023, released 05/2024

MovieLens 32M 링크

파일 다운로드

from urllib.request import urlretrieve
from pathlib import Path
import zipfile

# URL of the zip archive to download
zip_url = "https://files.grouplens.org/datasets/movielens/ml-32m.zip"

# Where to store the data: <current working directory>/data
download_dir = Path.cwd() / "data"
download_dir.mkdir(parents=True, exist_ok=True)

zip_file_path = download_dir / "downloaded_file.zip"  # temporary path of the downloaded archive

# The CSV files the rest of this notebook reads. When all of them are already
# on disk, the archive is not fetched again on a re-run.
dataset_dir = download_dir / "ml-32m"
expected_files = [
    dataset_dir / file_name
    for file_name in ("movies.csv", "ratings.csv", "tags.csv")
]

if all(csv_path.is_file() for csv_path in expected_files):
    print(f"Dataset already present, skipping download: {dataset_dir}")
else:
    try:
        # Download the archive
        urlretrieve(zip_url, zip_file_path)

        # Extract the archive into the data directory
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            zip_ref.extractall(download_dir)
    finally:
        # Delete the archive, including a partial one left behind by a
        # failed download or extraction.
        if zip_file_path.exists():
            zip_file_path.unlink()
            print(f"Deleted zip file: {zip_file_path}")

파일 살펴보기

# "cat" 명령어로 movies.csv의 첫 5줄 보기
# IPython shell escape: pipe the file through "head" to print only its first 5 lines.
# NOTE: "head" exits after five lines while "cat" is still writing to the pipe,
# which is why cat can report "Broken pipe" after the output.
!cat data/ml-32m/movies.csv | head -n 5

movieId,title,genres
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
cat: stdout: Broken pipe

파일 읽기

내장 함수 open()으로 읽기

# Print the first five raw lines of movies.csv.
# The encoding is passed explicitly: without it open() falls back to the
# locale's preferred encoding, which can fail or garble non-ASCII text on
# systems whose default is not UTF-8.
# NOTE(review): assumes the MovieLens CSV files are UTF-8 encoded - confirm against the dataset README.
with open("data/ml-32m/movies.csv", encoding="utf-8") as f:
    for _ in range(5):
        # readline() keeps the trailing newline, so print must not add another
        print(f.readline(), end='')

movieId,title,genres
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance

csv 모듈로 읽기

import csv

# Print the first five parsed rows (header included) of movies.csv.
# newline="" lets the csv module do its own newline handling, so quoted
# fields containing line breaks are parsed correctly; the encoding is explicit
# so the result does not depend on the locale default.
# NOTE(review): assumes the MovieLens CSV files are UTF-8 encoded - confirm against the dataset README.
with open("data/ml-32m/movies.csv", newline="", encoding="utf-8") as f:
    reader = csv.reader(f)
    for _ in range(5):
        print(next(reader))

['movieId', 'title', 'genres']
['1', 'Toy Story (1995)', 'Adventure|Animation|Children|Comedy|Fantasy']
['2', 'Jumanji (1995)', 'Adventure|Children|Fantasy']
['3', 'Grumpier Old Men (1995)', 'Comedy|Romance']
['4', 'Waiting to Exhale (1995)', 'Comedy|Drama|Romance']

from csv import DictReader

# Load every row of movies.csv as a dict keyed by the header names.
# The file is opened in a with-block so the handle is closed once all rows are
# read; newline="" and an explicit encoding are what the csv module expects.
# NOTE(review): assumes the MovieLens CSV files are UTF-8 encoded - confirm against the dataset README.
with open("data/ml-32m/movies.csv", newline="", encoding="utf-8") as f:
    results = list(DictReader(f))
results[:5]

[{'movieId': '1',
  'title': 'Toy Story (1995)',
  'genres': 'Adventure|Animation|Children|Comedy|Fantasy'},
 {'movieId': '2',
  'title': 'Jumanji (1995)',
  'genres': 'Adventure|Children|Fantasy'},
 {'movieId': '3',
  'title': 'Grumpier Old Men (1995)',
  'genres': 'Comedy|Romance'},
 {'movieId': '4',
  'title': 'Waiting to Exhale (1995)',
  'genres': 'Comedy|Drama|Romance'},
 {'movieId': '5',
  'title': 'Father of the Bride Part II (1995)',
  'genres': 'Comedy'}]

pandas로 읽기

# Read the three CSV files into DataFrames, in the order movies, ratings, tags.
movies, ratings, tags = map(
    pd.read_csv,
    (
        "data/ml-32m/movies.csv",
        "data/ml-32m/ratings.csv",
        "data/ml-32m/tags.csv",
    ),
)

# Report the in-memory size of each DataFrame in MB.
# deep=True makes pandas inspect the contents of object-dtype columns
# (e.g. Python strings); the default shallow count only includes the
# per-element references, which under-reports frames holding text.
print(f"movies 데이터의 메모리 사이즈: {movies.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
print(f"ratings 데이터의 메모리 사이즈: {ratings.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
print(f"tags 데이터의 메모리 사이즈: {tags.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

movies 데이터의 메모리 사이즈: 2.00 MB
ratings 데이터의 메모리 사이즈: 976.57 MB
tags 데이터의 메모리 사이즈: 61.04 MB

parquet로 저장하기

ratings, movies, tags를 parquet 파일로 저장
pyarrow 필요: pip install pyarrow

# Write each DataFrame to a parquet file in the same directory as the CSVs.
for frame, parquet_path in (
    (ratings, "data/ml-32m/ratings.parquet"),
    (movies, "data/ml-32m/movies.parquet"),
    (tags, "data/ml-32m/tags.parquet"),
):
    frame.to_parquet(parquet_path)

# Display the first five rows of the movies table
movies.head(n=5)

	movieId	title	genres
0	1	Toy Story (1995)	Adventure\|Animation\|Children\|Comedy\|Fantasy
1	2	Jumanji (1995)	Adventure\|Children\|Fantasy
2	3	Grumpier Old Men (1995)	Comedy\|Romance
3	4	Waiting to Exhale (1995)	Comedy\|Drama\|Romance
4	5	Father of the Bride Part II (1995)	Comedy

# Display the first five rows of the ratings table
ratings.head(n=5)

	userId	movieId	rating	timestamp
0	1	17	4.000	944249077
1	1	25	1.000	944250228
2	1	29	2.000	943230976
3	1	30	5.000	944249077
4	1	32	5.000	943228858

# Display the first five rows of the tags table
tags.head(n=5)

	userId	movieId	tag	timestamp
0	22	26479	Kevin Kline	1583038886
1	22	79592	misogyny	1581476297
2	22	247150	acrophobia	1622483469
3	34	2174	music	1249808064
4	34	2174	weird	1249808102