load_imdb.py
"""Load IMDb TSV files
1 Download the files if they're not present
2 Load the TSV files a chunk at time
3 Clean the chunk
4 Write the chunk to either a CSV file or SQLite database table
"""
import os
import argparse
import re
import sqlite3 as sqlite

import pandas as pd
from tqdm import tqdm

from imdb_cleaner import (
    IMDbCleaner,
    TitleBasicsCleaner,
    TitleCrewCleaner,
    TitleEpisodeCleaner,
)

# Default configuration settings
SETTINGS = {
    "input_dir": "import",
    "output_format": "csv",
    "output_dir": "export",
    "db_file": "imdb.sqlite",
    "chunksize": 10000,
}

CLEANER_CLASSES = {
    "name.basics.tsv": IMDbCleaner,
    "title.basics.tsv": TitleBasicsCleaner,
    "title.crew.tsv": TitleCrewCleaner,
    "title.episode.tsv": TitleEpisodeCleaner,
    "title.principals.tsv": IMDbCleaner,
    "title.ratings.tsv": IMDbCleaner,
    # "title.akas.tsv": IMDbCleaner,
}
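
# A custom cleaner can be slotted in per file. A minimal sketch, assuming the
# base class keeps the constructor and clean_data() generator used below (the
# real imdb_cleaner API may differ, and the "types" column is hypothetical):
#
#   class TitleAkasCleaner(IMDbCleaner):
#       def clean_data(self):
#           for chunk in super().clean_data():
#               yield chunk.drop(columns=["types"], errors="ignore")
#
#   CLEANER_CLASSES["title.akas.tsv"] = TitleAkasCleaner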


def tsv_load(tsv_file, chunksize):
    """Load the data from the TSV file in chunks."""
    return pd.read_csv(tsv_file, sep="\t", chunksize=chunksize)
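
# IMDb TSVs use "\N" for missing values and contain unescaped quote characters,
# so a stricter reader might pass extra options. A sketch (not used here, on the
# assumption that the cleaner classes deal with nulls and types downstream):
#
#   import csv
#   pd.read_csv(tsv_file, sep="\t", chunksize=chunksize,
#               na_values="\\N", quoting=csv.QUOTE_NONE, dtype=str)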


def sanitise_table_name(table_name):
    """Convert a string to a valid SQLite table name."""
    # From Claude Haiku
    # Replace special characters and spaces with underscores
    table_name = re.sub(r"[^a-zA-Z0-9_]", "_", table_name)
    # Convert to lowercase
    table_name = table_name.lower()
    # Truncate overlong names; 63 characters is PostgreSQL's identifier limit
    # (SQLite itself has no practical limit), kept here for portability
    max_length = 63
    if len(table_name) > max_length:
        table_name = table_name[:max_length]
    return table_name
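
# For example, sanitise_table_name("title.basics") returns "title_basics";
# main() below prefixes this to give the table name "load_title_basics".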


def df_to_csv(data_generator, csv_path):
    """Write cleaned data-frame chunks from an IMDb TSV file to a CSV file."""
    with open(csv_path, "w", encoding="utf-8", newline="") as csv_file:
        # Read the cleaned data in chunks and append each one to the CSV file
        for chunk in data_generator:
            # csv_file.tell() returns the current position in the open file
            if csv_file.tell() == 0:
                # For the first chunk, include the header
                chunk.to_csv(csv_file, index=False)
            else:
                # For subsequent chunks, append without repeating the header
                chunk.to_csv(csv_file, index=False, header=False)
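
# Example usage, with hypothetical paths:
#   frames = tsv_load("import/title.ratings.tsv", SETTINGS["chunksize"])
#   cleaner = IMDbCleaner(frames, "title.ratings.tsv")
#   df_to_csv(data_generator=cleaner.clean_data(), csv_path="export/title.ratings.csv")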


def df_to_sqlite(data_generator, db_path, db_table):
    """Write cleaned data-frame chunks from an IMDb TSV file to a SQLite table."""
    db_conn = sqlite.connect(db_path)
    with db_conn:
        # Read the cleaned data in chunks and write each one to the table
        for i, chunk in enumerate(data_generator):
            if i == 0:
                # For the first chunk, create or replace the table
                chunk.to_sql(
                    db_table,
                    db_conn,
                    if_exists="replace",
                    index=False,
                    chunksize=SETTINGS["chunksize"],
                )
            else:
                # For subsequent chunks, append to the table
                chunk.to_sql(
                    db_table,
                    db_conn,
                    if_exists="append",
                    index=False,
                    chunksize=SETTINGS["chunksize"],
                )
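
# Example usage, mirroring the call in main() with hypothetical paths:
#   frames = tsv_load("import/title.ratings.tsv", SETTINGS["chunksize"])
#   cleaner = IMDbCleaner(frames, "title.ratings.tsv")
#   df_to_sqlite(data_generator=cleaner.clean_data(),
#                db_path="export/imdb.sqlite", db_table="load_title_ratings")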


def main():
    """By default, read TSV files from the `import` directory, clean them,
    convert them to CSV files and write them to the `export` directory.
    Optionally, write to a SQLite database instead."""
    # Create the export directory if it doesn't exist
    os.makedirs(SETTINGS["output_dir"], exist_ok=True)
    # Check the command-line settings
    if SETTINGS["output_format"] == "sqlite":
        if not SETTINGS["db_file"]:
            raise ValueError(
                "SQLite database file path must be provided when output format is 'sqlite'"
            )
    elif SETTINGS["output_format"] != "csv":
        raise ValueError(f"Invalid output format: {SETTINGS['output_format']}")
    # Format for the progress bar
    bar_format = "Progress: {l_bar}{bar} | Completed: {n_fmt} | Time: [{elapsed}]"
    # Iterate through all TSV files with a progress bar
    with tqdm(
        total=len(CLEANER_CLASSES),
        bar_format=bar_format,
        desc="Processing files",
    ) as files_progress:
        for tsv_name, cleaner_class in CLEANER_CLASSES.items():
            tsv_path = os.path.join(SETTINGS["input_dir"], tsv_name)
            # Create a generator that supplies the TSV data in chunks as data frames
            data_generator = tsv_load(tsv_path, SETTINGS["chunksize"])
            # Wrap it in the cleaner class mapped to this file name in CLEANER_CLASSES,
            # which supplies the cleaned data frames
            cleaner = cleaner_class(data_generator, tsv_name)
            if SETTINGS["output_format"] == "csv":
                # Define the corresponding CSV file path in the export directory
                csv_path = os.path.join(
                    SETTINGS["output_dir"], tsv_name.replace(".tsv", ".csv")
                )
                # Write the cleaned data frames to a CSV file
                df_to_csv(data_generator=cleaner.clean_data(), csv_path=csv_path)
            elif SETTINGS["output_format"] == "sqlite":
                db_path = os.path.join(SETTINGS["output_dir"], SETTINGS["db_file"])
                # Define the corresponding table name in the SQLite database
                db_table = "load_" + sanitise_table_name(os.path.splitext(tsv_name)[0])
                # Write the cleaned data frames to a SQLite table
                df_to_sqlite(
                    data_generator=cleaner.clean_data(),
                    db_path=db_path,
                    db_table=db_table,
                )
            files_progress.update(1)
    print("Conversion complete!")


if __name__ == "__main__":
    # TODO: review from @sfkleach
    # The bottom of load_imdb.py has an `if __name__ == "__main__":` clause. A really
    # good rule is to never declare any variables in this section: it prevents awkward
    # name clashes and serves as a rule of thumb that the section has grown too big.
    # The arguments are unpacked from a namespace into the global variable SETTINGS,
    # which is a dict. My suggestion is firstly: how about making SETTINGS a namespace
    # and then doing `SETTINGS = parser.parse_args()`?
    # And I would be tempted to do away with a global SETTINGS by adding a
    # `class LoadIMDB:` and stuffing all the methods into it. Then the global SETTINGS
    # becomes an ordinary instance variable.
    # Parse command-line arguments
    parser = argparse.ArgumentParser(
        description="Process TSV files from IMDb and save to CSV or SQLite."
    )
    parser.add_argument(
        "input_dir",
        type=str,
        nargs="?",
        default="import",
        help="Directory that contains the input TSV files",
    )
    parser.add_argument(
        "--output_format",
        type=str,
        choices=["csv", "sqlite"],
        default="csv",
        help="Output format: csv or sqlite",
    )
    parser.add_argument(
        "--db_file",
        type=str,
        default="imdb.sqlite",
        help="Name of the SQLite database file (used when output format is sqlite)",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default="export",
        help="Directory for the output CSV files or SQLite database",
    )
    args = parser.parse_args()

    # Configuration settings
    SETTINGS["input_dir"] = args.input_dir
    SETTINGS["output_format"] = args.output_format
    SETTINGS["output_dir"] = args.output_dir
    SETTINGS["db_file"] = args.db_file

    main()
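
# Example invocations (assuming the IMDb TSV files have been downloaded to ./import):
#   python load_imdb.py
#   python load_imdb.py import --output_format sqlite --db_file imdb.sqlite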