Problem: Find top rated movie using HIVE and store the result to HDFS

1. Download data from below site.
https://datasets.imdbws.com/
2. Download the movies data title.ratings.tsv.gz and title.akas.tsv.gz
3. Find the top 50 rated movies with more than 100000 votes
4. Find the top 50 voted movies
5. Storage details
Columns: titleId,title,region,language,averageRating,numVotes
Store the result at below location: /home/cloudera/workspace/movies/hive/<formatname>
Store the result in following format.

a. Text file
Columns to be seperated with tab "\t"

b. Sequence file.

c. RC file.

d. Parquet.

e. ORC file.
Compression: SNAPPY

f. Avro file.

Use Hive to load data and output data to required location.

Pre work:

hadoop fs -mkdir /home/cloudera/workspace/movies
[root@quickstart movies]# hadoop fs -mkdir /home/cloudera/workspace/movies
[root@quickstart movies]# hadoop fs -put /home/cloudera/Downloads/movies/title.ratings.tsv.gz /home/cloudera/workspace/movies
[root@quickstart movies]# hadoop fs -ls /home/cloudera/workspace/movies/
Found 1 items
-rw-r--r-- 1 root supergroup 3984256 2018-02-18 15:26 /home/cloudera/workspace/movies/title.ratings.tsv.gz

val titleRating = sc.textFile("file:///home/cloudera/Downloads/movies/title.ratings.tsv.gz")
or
val titleRating = sc.textFile("/home/cloudera/workspace/movies/title.ratings.tsv.gz")

scala> titleRating.take(2)
res24: Array[String] = Array(tconst averageRating numVotes, tt0000001 5.8 1350)

[root@quickstart Downloads]# hadoop fs -put /home/cloudera/Downloads/movies/title.akas.tsv.gz /home/cloudera/workspace/movies
[root@quickstart Downloads]# hadoop fs -ls /home/cloudera/workspace/movies/
Found 2 items
-rw-r--r-- 1 root supergroup 52301827 2018-02-18 15:32 /home/cloudera/workspace/movies/title.akas.tsv.gz
-rw-r--r-- 1 root supergroup 3984256 2018-02-18 15:26 /home/cloudera/workspace/movies/title.ratings.tsv.gz

val title = sc.textFile("file:///home/cloudera/Downloads/movies/title.akas.tsv.gz")
or
val title = sc.textFile("/home/cloudera/workspace/movies/title.akas.tsv.gz")

scala> title.take(2)
res22: Array[String] = Array(titleId ordering title region language types attributes isOriginalTitle, tt0000001 1 Carmencita - spanyol tánc HU \N imdbDisplay \N 0)

Solution:
-----------------------------------------------------------------------------------------

First Load the data from local drive into HIVE.

drop table title;
CREATE TABLE title(
titleId string,
ordering string,
title string,
region string,
language string,
types string,
attributes string,
isOriginalTitle string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
LOCATION '/home/cloudera/workspace/movies/title'
TBLPROPERTIES("skip.header.line.count"="1");

LOAD DATA LOCAL INPATH '/home/cloudera/Downloads/movies/title.akas.tsv.gz' OVERWRITE INTO TABLE title;

SELECT * from title limit 5;

drop table titlerating;
CREATE TABLE titlerating(
titleId string,
averageRating double,
numVotes int)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
LOCATION '/home/cloudera/workspace/movies/titlerating'
tblproperties("skip.header.line.count"="1");

LOAD DATA LOCAL INPATH '/home/cloudera/Downloads/movies/title.ratings.tsv.gz' OVERWRITE INTO TABLE titlerating;

SELECT * from titlerating limit 5;

---------------------------------------------------------------------------
3. Find the top 50 rated movies with more than 100000 votes

select title.titleId,title,region,language,averageRating,numVotes from title inner join titlerating on (title.titleId=titlerating.titleId)
where numVotes>100000 order by averageRating desc limit 50;

a. Text file
Columns to be seperated with tab "\t"

CREATE TABLE top50moviestext
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
"separatorChar" = "\t"
)
STORED AS TEXTFILE
LOCATION '/home/cloudera/workspace/movies/hive/averageratingtext'
AS
select title.titleId,title,region,language,averageRating,numVotes from title inner join titlerating on (title.titleId=titlerating.titleId)
where numVotes>100000 order by averageRating desc limit 50;

b. Sequence file.

CREATE TABLE top50moviesSequence
STORED AS SEQUENCEFILE
LOCATION '/home/cloudera/workspace/movies/hive/averageratingsequence'
AS
select title.titleId,title,region,language,averageRating,numVotes from title inner join titlerating on (title.titleId=titlerating.titleId)
where numVotes>100000 order by averageRating desc limit 50;

c. RC file.

CREATE TABLE top50moviesrc
STORED AS RCFILE
LOCATION '/home/cloudera/workspace/movies/hive/averageratingrc'
AS
select title.titleId,title,region,language,averageRating,numVotes from title inner join titlerating on (title.titleId=titlerating.titleId)
where numVotes>100000 order by averageRating desc limit 50;

d. Parquet.

CREATE TABLE top50moviesparquet
STORED AS PARQUET
LOCATION '/home/cloudera/workspace/movies/hive/averageratingparquet'
AS
select title.titleId,title,region,language,averageRating,numVotes from title inner join titlerating on (title.titleId=titlerating.titleId)
where numVotes>100000 order by averageRating desc limit 50;

e. ORC file.
Compression: SNAPPY

CREATE TABLE top50moviesrc
STORED AS ORCFILE
LOCATION '/home/cloudera/workspace/movies/hive/averageratingorc'
TBLPROPERTIES("orc.compress"="SNAPPY")
AS
select title.titleId,title,region,language,averageRating,numVotes from title inner join titlerating on (title.titleId=titlerating.titleId)
where numVotes>100000 order by averageRating desc limit 50;

f. Avro file.

CREATE TABLE top50moviesavro
STORED AS AVRO
LOCATION '/home/cloudera/workspace/movies/hive/averageratingavro'
AS
select title.titleId,title,region,language,averageRating,numVotes from title inner join titlerating on (title.titleId=titlerating.titleId)
where numVotes>100000 order by averageRating desc limit 50;

---------------------------------------------------------------------------

4. Find the top 50 voted movies

select title.titleId,title,region,language,averageRating,numVotes from title inner join titlerating on (title.titleId=titlerating.titleId)
order by numVotes desc limit 50;

a. Text file
Columns to be seperated with tab "\t"

create table top50ratedmovietext
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
LOCATION '/home/cloudera/workspace/movies/hive/topratedmovietext'
AS
select title.titleId,title,region,language,averageRating,numVotes from title inner join titlerating on (title.titleId=titlerating.titleId)
order by numVotes desc limit 50;

b. Sequence file.

create table top50ratedmovieseq
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS SEQUENCEFILE
LOCATION '/home/cloudera/workspace/movies/hive/top50ratedmovieseq'
AS
select title.titleId,title,region,language,averageRating,numVotes from title inner join titlerating on (title.titleId=titlerating.titleId)
order by numVotes desc limit 50;

c. RC file.

create table top50ratedmovierc
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS RCEFILE
LOCATION '/home/cloudera/workspace/movies/hive/top50ratedmovierc'
AS
select title.titleId,title,region,language,averageRating,numVotes from title inner join titlerating on (title.titleId=titlerating.titleId)
order by numVotes desc limit 50;

d. Parquet.

create table top50ratedmovieparquet
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS PARQUET
LOCATION '/home/cloudera/workspace/movies/hive/top50ratedmovieparquet'
AS
select title.titleId,title,region,language,averageRating,numVotes from title inner join titlerating on (title.titleId=titlerating.titleId)
order by numVotes desc limit 50;

e. ORC file.
Compression: SNAPPY

create table top50ratedmovieorc
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS ORC
LOCATION '/home/cloudera/workspace/movies/hive/top50ratedmovieorc'
TBLPROPERTIES ("orc.compress"="SNAPPY")
AS
select title.titleId,title,region,language,averageRating,numVotes from title inner join titlerating on (title.titleId=titlerating.titleId)
order by numVotes desc limit 50;

f. Avro file.

create table top50ratedmovieavro
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS AVRO
LOCATION '/home/cloudera/workspace/movies/hive/top50ratedmovieavro'
AS
select title.titleId,title,region,language,averageRating,numVotes from title inner join titlerating on (title.titleId=titlerating.titleId)
order by numVotes desc limit 50;

Search This Blog

Apache Spark

Problem: Find top rated movie using HIVE and store the result to HDFS

Comments

Post a Comment

Popular posts from this blog

Conversion from one file format to other in Apache Spark

How to use (inner) JOIN and group by in apache spark SQL.