User Tools

Site Tools


postgresqlimdb

IMDB Database

The import of the data was based on the script here: Import IMDB Datasets into Postgres
Run the below script as the postgres user to avoid permission issues.

printf "Removing and create folder \n"
rm -rf imdb-datasets/
mkdir imdb-datasets/
 
printf "Download datasets from https://datasets.imdbws.com \n"
cd imdb-datasets
curl -O https://datasets.imdbws.com/name.basics.tsv.gz
curl -O https://datasets.imdbws.com/title.akas.tsv.gz
curl -O https://datasets.imdbws.com/title.basics.tsv.gz
curl -O https://datasets.imdbws.com/title.crew.tsv.gz
curl -O https://datasets.imdbws.com/title.episode.tsv.gz
curl -O https://datasets.imdbws.com/title.principals.tsv.gz
curl -O https://datasets.imdbws.com/title.ratings.tsv.gz
 
printf "Unzipping datasets... \n"
gzip -dk *.gz
cd ..
 
printf "Creating Database \n"
psql -d 'postgres' -c "DROP DATABASE IF EXISTS imdb;"
psql -d 'postgres' -c "CREATE DATABASE imdb;"
 
printf "Creating tables in imdb database \n"
psql -d imdb -c "CREATE table title_ratings (tconst VARCHAR(10),average_rating NUMERIC,num_votes integer);"
psql -d imdb -c "CREATE TABLE name_basics (nconst varchar(10), primaryName text, birthYear smallint, deathYear smallint, primaryProfession text, knownForTitles text );"
psql -d imdb -c "CREATE TABLE title_akas (titleId TEXT, ordering INTEGER, title TEXT, region TEXT, language TEXT, types TEXT, attributes TEXT, isOriginalTitle BOOLEAN);"
psql -d imdb -c "CREATE TABLE title_basics (tconst TEXT, titleType TEXT, primaryTitle TEXT, originalTitle TEXT, isAdult BOOLEAN, startYear SMALLINT, endYear SMALLINT, runtimeMinutes INTEGER, genres TEXT);"
psql -d imdb -c "CREATE TABLE title_crew (tconst TEXT, directors TEXT, writers TEXT);"
psql -d imdb -c "CREATE TABLE title_episode (const TEXT, parentTconst TEXT, seasonNumber TEXT, episodeNumber TEXT);"
psql -d imdb -c "CREATE TABLE title_principals (tconst TEXT, ordering INTEGER, nconst TEXT, category TEXT, job TEXT, characters TEXT);"
 
printf "Inserting data into tables \n"
psql -d imdb -c "COPY title_ratings FROM '$(pwd)/imdb-datasets/title.ratings.tsv' DELIMITER E'\t' QUOTE E'\b' NULL '\N' CSV HEADER"
psql -d imdb -c "COPY name_basics FROM '$(pwd)/imdb-datasets/name.basics.tsv' DELIMITER E'\t' QUOTE E'\b' NULL '\N' CSV HEADER"
psql -d imdb -c "COPY title_akas FROM '$(pwd)/imdb-datasets/title.akas.tsv' DELIMITER E'\t' QUOTE E'\b' NULL '\N' CSV HEADER"
psql -d imdb -c "COPY title_basics FROM '$(pwd)/imdb-datasets/title.basics.tsv' DELIMITER E'\t' QUOTE E'\b' NULL '\N' CSV HEADER"
psql -d imdb -c "COPY title_crew FROM '$(pwd)/imdb-datasets/title.crew.tsv' DELIMITER E'\t' QUOTE E'\b' NULL '\N' CSV HEADER"
psql -d imdb -c "COPY title_episode FROM '$(pwd)/imdb-datasets/title.episode.tsv' DELIMITER E'\t' QUOTE E'\b' NULL '\N' CSV HEADER"
psql -d imdb -c "COPY title_principals FROM '$(pwd)/imdb-datasets/title.principals.tsv' DELIMITER E'\t' QUOTE E'\b' NULL '\N' CSV HEADER"

Next run the below SQL script to

  • Create Primary Keys
  • Fix Data Issues
  • Create Foreign Keys
  • Convert Column Lists to Arrays
ALTER TABLE title_ratings ADD CONSTRAINT title_ratings_pk PRIMARY KEY (tconst);
ALTER TABLE title_akas ADD CONSTRAINT title_akas_pk PRIMARY KEY (titleid, ordering);
ALTER TABLE title_basics ADD CONSTRAINT title_basics_pk PRIMARY KEY (tconst);
ALTER TABLE title_crew ADD CONSTRAINT title_crew_pk PRIMARY KEY (tconst);
ALTER TABLE title_episode ADD CONSTRAINT title_episode_pk PRIMARY KEY (const);
ALTER TABLE title_principals ADD CONSTRAINT title_principals_pk PRIMARY KEY (tconst, ordering);
ALTER TABLE name_basics ADD CONSTRAINT name_basics_pk PRIMARY KEY (nconst);
 
INSERT INTO name_basics (nconst,primaryname) VALUES ('nm2007716','Unknown'), ('nm10111415','Unknown'),
                                                    ('nm15015978','Unknown'), ('nm13369741','Unknown');
 
ALTER TABLE title_ratings ADD CONSTRAINT title_ratings_fk FOREIGN KEY (tconst) REFERENCES title_basics (tconst);
ALTER TABLE title_episode ADD CONSTRAINT title_episode_fk FOREIGN KEY (parenttconst) REFERENCES title_basics (tconst);
ALTER TABLE title_principals ADD CONSTRAINT title_principals_fk FOREIGN KEY (nconst) REFERENCES name_basics (nconst);
 
ALTER TABLE name_basics ALTER primaryprofession TYPE text[] USING string_to_array(primaryprofession, ',');
ALTER TABLE name_basics ALTER knownfortitles TYPE text[] USING string_to_array(knownfortitles, ',');
ALTER TABLE title_basics ALTER genres TYPE text[] USING string_to_array(genres, ',');
ALTER TABLE title_crew ALTER directors TYPE text[] USING string_to_array(directors, ',');
ALTER TABLE title_crew ALTER writers TYPE text[] USING string_to_array(writers, ',');
postgresqlimdb.txt · Last modified: 2024/04/11 12:56 by z0hpvk