Python Programming
  • Home
  • Intro
    • History & Background
    • Python Setup
  • QPB
    • Part I: Chapter 1-3
    • Part II
    • 5. Lists, Tuples, Sets
  • Exercises
    • Chapter 5: Lists, Tuples, Sets
    • Chapter 6: Strings
    • Chapter 7: Dictionaries
    • Chapter 8: Control flow
    • Chapter 9: Functions
    • Chapter 14: Exceptions
    • Chapter 15: Classes
  • Exploring Data
    • NumPy & pandas
    • Visualization
  • Library System
  • Netflix Movie Analysis
    • Notes
    • Project-Native
    • Project-pandas
  • References
    • QPB Part 1
    • QPB Part 2
    • QPB Part 3
    • QPB Part 4

On this page

  • 파일 다운로드
  • 파일 읽기
    • 표준입력으로 읽기
    • csv 모듈로 읽기
    • pandas로 읽기

Project: MovieLens 32M

Load packages
# numerical calculation & data frames
import numpy as np
import pandas as pd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
import seaborn.objects as so

# statistics
import statsmodels.api as sm

# --- pandas: behaviour and display settings ---
pd.options.mode.copy_on_write = True  # enable the 'mode.copy_on_write' option (pandas 2.0)
pd.set_option("display.float_format", "{:.3f}".format)  # undo with pd.reset_option('display.float_format')
pd.set_option("display.max_rows", 7)  # max number of rows to display
pd.set_option("display.notebook_repr_html", True)  # display html in notebook

# --- NumPy: printing ---
np.set_printoptions(suppress=True, precision=2)  # suppress scientific notation

# --- seaborn.objects theme: start from matplotlib's 'ggplot' style, then override ---
from matplotlib import style
theme_dict = dict(style.library["ggplot"])
theme_dict["grid.linestyle"] = ":"
theme_dict["axes.facecolor"] = "white"
theme_dict["grid.color"] = ".6"
so.Plot.config.theme.update(theme_dict)

# Alternative theme, kept for reference (disabled):
# theme_dict = {**sns.axes_style("whitegrid"), "grid.linestyle": ":"}
# so.Plot.config.theme.update(theme_dict)

# --- inline figures: "retina" format for high resolution display ---
import matplotlib_inline
matplotlib_inline.backend_inline.set_matplotlib_formats("retina")
  • 32 million movie ratings
  • 2 million tags applied to 87,585 movies by 200,948 users.
  • Collected 10/2023, released 05/2024

MovieLens 32M 링크

파일 다운로드

from urllib.request import urlretrieve
from pathlib import Path
import zipfile

# URL of the zip archive to download
zip_url = "https://files.grouplens.org/datasets/movielens/ml-32m.zip"

# Destination folder: <current working directory>/data (created if missing)
download_dir = Path.cwd() / "data"
download_dir.mkdir(parents=True, exist_ok=True)

# Temporary location of the downloaded archive
zip_file_path = download_dir / "downloaded_file.zip"

try:
    # Download the archive
    urlretrieve(zip_url, zip_file_path)

    # Extract the archive's contents into download_dir
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(download_dir)
finally:
    # Remove the archive whether or not the steps above succeeded, so a failed
    # download or extraction does not leave a stray zip file behind.
    # The existence check avoids a misleading message when nothing was written.
    if zip_file_path.exists():
        zip_file_path.unlink()
        print(f"Deleted zip file: {zip_file_path}")

파일 살펴보기

# Show the first 5 lines of movies.csv using the shell "cat" command piped into "head".
# NOTE(review): the captured output ends with "cat: stdout: Broken pipe"; presumably this is
# because head exits after 5 lines while cat is still writing. `head -n 5 <file>` would avoid it.
!cat data/ml-32m/movies.csv | head -n 5
movieId,title,genres
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
cat: stdout: Broken pipe

파일 읽기

표준입력으로 읽기

# Read the file as UTF-8 explicitly: without `encoding`, open() falls back to a
# platform/locale-dependent default, which can mis-decode or fail on non-ASCII text.
with open("data/ml-32m/movies.csv", encoding="utf-8") as f:
    # Print the header line plus the next four lines.
    # Each line already ends with a newline, so print() must not add another.
    for _ in range(5):
        print(f.readline(), end='')
movieId,title,genres
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance

csv 모듈로 읽기

import csv

# newline="" lets the csv module do its own line-ending handling (as its
# documentation requires for file objects); encoding is given explicitly so the
# result does not depend on the platform's default encoding.
with open("data/ml-32m/movies.csv", encoding="utf-8", newline="") as f:
    reader = csv.reader(f)
    # Print the header row and the first four records; each row is a list of strings.
    for _ in range(5):
        print(next(reader))
['movieId', 'title', 'genres']
['1', 'Toy Story (1995)', 'Adventure|Animation|Children|Comedy|Fantasy']
['2', 'Jumanji (1995)', 'Adventure|Children|Fantasy']
['3', 'Grumpier Old Men (1995)', 'Comedy|Romance']
['4', 'Waiting to Exhale (1995)', 'Comedy|Drama|Romance']
from csv import DictReader

# Read every row into a list of dicts keyed by the header names.
# The file is opened in a `with` block so the handle is closed once all rows are
# read; newline="" and an explicit encoding follow the csv module's guidance.
with open("data/ml-32m/movies.csv", encoding="utf-8", newline="") as f:
    results = list(DictReader(f))

# Show the first five records
results[:5]
[{'movieId': '1',
  'title': 'Toy Story (1995)',
  'genres': 'Adventure|Animation|Children|Comedy|Fantasy'},
 {'movieId': '2',
  'title': 'Jumanji (1995)',
  'genres': 'Adventure|Children|Fantasy'},
 {'movieId': '3',
  'title': 'Grumpier Old Men (1995)',
  'genres': 'Comedy|Romance'},
 {'movieId': '4',
  'title': 'Waiting to Exhale (1995)',
  'genres': 'Comedy|Drama|Romance'},
 {'movieId': '5',
  'title': 'Father of the Bride Part II (1995)',
  'genres': 'Comedy'}]

pandas로 읽기

# Load the three MovieLens CSV files into DataFrames.
tags = pd.read_csv("data/ml-32m/tags.csv")
ratings = pd.read_csv("data/ml-32m/ratings.csv")
movies = pd.read_csv("data/ml-32m/movies.csv")

# Report the memory size of each DataFrame in MB.
# memory_usage() is called with its defaults and summed, then divided by 1024**2.
bytes_per_mb = 1024**2
movies_mb = movies.memory_usage().sum() / bytes_per_mb
ratings_mb = ratings.memory_usage().sum() / bytes_per_mb
tags_mb = tags.memory_usage().sum() / bytes_per_mb

print(f"movies 데이터의 메모리 사이즈: {movies_mb:.2f} MB")
print(f"ratings 데이터의 메모리 사이즈: {ratings_mb:.2f} MB")
print(f"tags 데이터의 메모리 사이즈: {tags_mb:.2f} MB")
movies 데이터의 메모리 사이즈: 2.00 MB
ratings 데이터의 메모리 사이즈: 976.57 MB
tags 데이터의 메모리 사이즈: 61.04 MB
parquet로 저장하기
  • ratings, movies, tags를 parquet 파일로 저장
  • pyarrow 필요: pip install pyarrow
# Write each DataFrame to a parquet file in the dataset folder
# (same order as before: ratings, movies, tags).
for frame, parquet_path in (
    (ratings, "data/ml-32m/ratings.parquet"),
    (movies, "data/ml-32m/movies.parquet"),
    (tags, "data/ml-32m/tags.parquet"),
):
    frame.to_parquet(parquet_path)
# Preview the first rows of the movies table (columns: movieId, title, genres).
movies.head()
movieId title genres
0 1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy
1 2 Jumanji (1995) Adventure|Children|Fantasy
2 3 Grumpier Old Men (1995) Comedy|Romance
3 4 Waiting to Exhale (1995) Comedy|Drama|Romance
4 5 Father of the Bride Part II (1995) Comedy
# Preview the first rows of the ratings table (columns: userId, movieId, rating, timestamp).
ratings.head()
userId movieId rating timestamp
0 1 17 4.000 944249077
1 1 25 1.000 944250228
2 1 29 2.000 943230976
3 1 30 5.000 944249077
4 1 32 5.000 943228858
# Preview the first rows of the tags table (columns: userId, movieId, tag, timestamp).
tags.head()
userId movieId tag timestamp
0 22 26479 Kevin Kline 1583038886
1 22 79592 misogyny 1581476297
2 22 247150 acrophobia 1622483469
3 34 2174 music 1249808064
4 34 2174 weird 1249808102

This work © 2025 by Sungkyun Cho is licensed under CC BY-NC-SA 4.0