postgresqlimdb
IMDB Database
The import of the data was based on the script here: Import IMDB Datasets into Postgres
Run the below script as the postgres user to avoid permission issues.
printf "Removing and create folder \n" rm -rf imdb-datasets/ mkdir imdb-datasets/ printf "Download datasets from https://datasets.imdbws.com \n" cd imdb-datasets curl -O https://datasets.imdbws.com/name.basics.tsv.gz curl -O https://datasets.imdbws.com/title.akas.tsv.gz curl -O https://datasets.imdbws.com/title.basics.tsv.gz curl -O https://datasets.imdbws.com/title.crew.tsv.gz curl -O https://datasets.imdbws.com/title.episode.tsv.gz curl -O https://datasets.imdbws.com/title.principals.tsv.gz curl -O https://datasets.imdbws.com/title.ratings.tsv.gz printf "Unzipping datasets... \n" gzip -dk *.gz cd .. printf "Creating Database \n" psql -d 'postgres' -c "DROP DATABASE IF EXISTS imdb;" psql -d 'postgres' -c "CREATE DATABASE imdb;" printf "Creating tables in imdb database \n" psql -d imdb -c "CREATE table title_ratings (tconst VARCHAR(10),average_rating NUMERIC,num_votes integer);" psql -d imdb -c "CREATE TABLE name_basics (nconst varchar(10), primaryName text, birthYear smallint, deathYear smallint, primaryProfession text, knownForTitles text );" psql -d imdb -c "CREATE TABLE title_akas (titleId TEXT, ordering INTEGER, title TEXT, region TEXT, language TEXT, types TEXT, attributes TEXT, isOriginalTitle BOOLEAN);" psql -d imdb -c "CREATE TABLE title_basics (tconst TEXT, titleType TEXT, primaryTitle TEXT, originalTitle TEXT, isAdult BOOLEAN, startYear SMALLINT, endYear SMALLINT, runtimeMinutes INTEGER, genres TEXT);" psql -d imdb -c "CREATE TABLE title_crew (tconst TEXT, directors TEXT, writers TEXT);" psql -d imdb -c "CREATE TABLE title_episode (const TEXT, parentTconst TEXT, seasonNumber TEXT, episodeNumber TEXT);" psql -d imdb -c "CREATE TABLE title_principals (tconst TEXT, ordering INTEGER, nconst TEXT, category TEXT, job TEXT, characters TEXT);" printf "Inserting data into tables \n" psql -d imdb -c "COPY title_ratings FROM '$(pwd)/imdb-datasets/title.ratings.tsv' DELIMITER E'\t' QUOTE E'\b' NULL '\N' CSV HEADER" psql -d imdb -c "COPY name_basics FROM '$(pwd)/imdb-datasets/name.basics.tsv' DELIMITER E'\t' QUOTE E'\b' NULL '\N' CSV HEADER" psql -d imdb -c "COPY title_akas FROM '$(pwd)/imdb-datasets/title.akas.tsv' DELIMITER E'\t' QUOTE E'\b' NULL '\N' CSV HEADER" psql -d imdb -c "COPY title_basics FROM '$(pwd)/imdb-datasets/title.basics.tsv' DELIMITER E'\t' QUOTE E'\b' NULL '\N' CSV HEADER" psql -d imdb -c "COPY title_crew FROM '$(pwd)/imdb-datasets/title.crew.tsv' DELIMITER E'\t' QUOTE E'\b' NULL '\N' CSV HEADER" psql -d imdb -c "COPY title_episode FROM '$(pwd)/imdb-datasets/title.episode.tsv' DELIMITER E'\t' QUOTE E'\b' NULL '\N' CSV HEADER" psql -d imdb -c "COPY title_principals FROM '$(pwd)/imdb-datasets/title.principals.tsv' DELIMITER E'\t' QUOTE E'\b' NULL '\N' CSV HEADER"
Next run the below SQL script to
- Create Primary Keys
- Fix Data Issues
- Create Foreign Keys
- Convert Column Lists to Arrays
ALTER TABLE title_ratings ADD CONSTRAINT title_ratings_pk PRIMARY KEY (tconst); ALTER TABLE title_akas ADD CONSTRAINT title_akas_pk PRIMARY KEY (titleid, ordering); ALTER TABLE title_basics ADD CONSTRAINT title_basics_pk PRIMARY KEY (tconst); ALTER TABLE title_crew ADD CONSTRAINT title_crew_pk PRIMARY KEY (tconst); ALTER TABLE title_episode ADD CONSTRAINT title_episode_pk PRIMARY KEY (const); ALTER TABLE title_principals ADD CONSTRAINT title_principals_pk PRIMARY KEY (tconst, ordering); ALTER TABLE name_basics ADD CONSTRAINT name_basics_pk PRIMARY KEY (nconst); INSERT INTO name_basics (nconst,primaryname) VALUES ('nm2007716','Unknown'), ('nm10111415','Unknown'), ('nm15015978','Unknown'), ('nm13369741','Unknown'); ALTER TABLE title_ratings ADD CONSTRAINT title_ratings_fk FOREIGN KEY (tconst) REFERENCES title_basics (tconst); ALTER TABLE title_episode ADD CONSTRAINT title_episode_fk FOREIGN KEY (parenttconst) REFERENCES title_basics (tconst); ALTER TABLE title_principals ADD CONSTRAINT title_principals_fk FOREIGN KEY (nconst) REFERENCES name_basics (nconst); ALTER TABLE name_basics ALTER primaryprofession TYPE text[] USING string_to_array(primaryprofession, ','); ALTER TABLE name_basics ALTER knownfortitles TYPE text[] USING string_to_array(knownfortitles, ','); ALTER TABLE title_basics ALTER genres TYPE text[] USING string_to_array(genres, ','); ALTER TABLE title_crew ALTER directors TYPE text[] USING string_to_array(directors, ','); ALTER TABLE title_crew ALTER writers TYPE text[] USING string_to_array(writers, ',');
postgresqlimdb.txt · Last modified: 2024/04/11 12:56 by z0hpvk