Problem: Find top rated movie using HIVE and store the result to HDFS
1. Download data from below site.
https://datasets.imdbws.com/
2. Download the movies data title.ratings.tsv.gz and title.akas.tsv.gz
3. Find the top 50 rated movies with more than 100000 votes
4. Find the top 50 voted movies
5. Storage details
Columns: titleId,title,region,language,averageRating,numVotes
Store the result at below location: /home/cloudera/workspace/movies/hive/<formatname>
Store the result in following format.
a. Text file
Columns to be seperated with tab "\t"
b. Sequence file.
c. RC file.
d. Parquet.
e. ORC file.
Compression: SNAPPY
f. Avro file.
Use Hive to load data and output data to required location.
Pre work:
hadoop fs -mkdir /home/cloudera/workspace/movies
[root@quickstart movies]# hadoop fs -mkdir /home/cloudera/workspace/movies
[root@quickstart movies]# hadoop fs -put /home/cloudera/Downloads/movies/title.ratings.tsv.gz /home/cloudera/workspace/movies
[root@quickstart movies]# hadoop fs -ls /home/cloudera/workspace/movies/
Found 1 items
-rw-r--r-- 1 root supergroup 3984256 2018-02-18 15:26 /home/cloudera/workspace/movies/title.ratings.tsv.gz
val titleRating = sc.textFile("file:///home/cloudera/Downloads/movies/title.ratings.tsv.gz")
or
val titleRating = sc.textFile("/home/cloudera/workspace/movies/title.ratings.tsv.gz")
scala> titleRating.take(2)
res24: Array[String] = Array(tconst averageRating numVotes, tt0000001 5.8 1350)
[root@quickstart Downloads]# hadoop fs -put /home/cloudera/Downloads/movies/title.akas.tsv.gz /home/cloudera/workspace/movies
[root@quickstart Downloads]# hadoop fs -ls /home/cloudera/workspace/movies/
Found 2 items
-rw-r--r-- 1 root supergroup 52301827 2018-02-18 15:32 /home/cloudera/workspace/movies/title.akas.tsv.gz
-rw-r--r-- 1 root supergroup 3984256 2018-02-18 15:26 /home/cloudera/workspace/movies/title.ratings.tsv.gz
val title = sc.textFile("file:///home/cloudera/Downloads/movies/title.akas.tsv.gz")
or
val title = sc.textFile("/home/cloudera/workspace/movies/title.akas.tsv.gz")
scala> title.take(2)
res22: Array[String] = Array(titleId ordering title region language types attributes isOriginalTitle, tt0000001 1 Carmencita - spanyol tánc HU \N imdbDisplay \N 0)
Solution:
-----------------------------------------------------------------------------------------
First Load the data from local drive into HIVE.
drop table title;
CREATE TABLE title(
titleId string,
ordering string,
title string,
region string,
language string,
types string,
attributes string,
isOriginalTitle string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
LOCATION '/home/cloudera/workspace/movies/title'
TBLPROPERTIES("skip.header.line.count"="1");
LOAD DATA LOCAL INPATH '/home/cloudera/Downloads/movies/title.akas.tsv.gz' OVERWRITE INTO TABLE title;
SELECT * from title limit 5;
drop table titlerating;
CREATE TABLE titlerating(
titleId string,
averageRating double,
numVotes int)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
LOCATION '/home/cloudera/workspace/movies/titlerating'
tblproperties("skip.header.line.count"="1");
LOAD DATA LOCAL INPATH '/home/cloudera/Downloads/movies/title.ratings.tsv.gz' OVERWRITE INTO TABLE titlerating;
SELECT * from titlerating limit 5;
---------------------------------------------------------------------------
3. Find the top 50 rated movies with more than 100000 votes
select title.titleId,title,region,language,averageRating,numVotes from title inner join titlerating on (title.titleId=titlerating.titleId)
where numVotes>100000 order by averageRating desc limit 50;
a. Text file
Columns to be seperated with tab "\t"
CREATE TABLE top50moviestext
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
"separatorChar" = "\t"
)
STORED AS TEXTFILE
LOCATION '/home/cloudera/workspace/movies/hive/averageratingtext'
AS
select title.titleId,title,region,language,averageRating,numVotes from title inner join titlerating on (title.titleId=titlerating.titleId)
where numVotes>100000 order by averageRating desc limit 50;
b. Sequence file.
CREATE TABLE top50moviesSequence
STORED AS SEQUENCEFILE
LOCATION '/home/cloudera/workspace/movies/hive/averageratingsequence'
AS
select title.titleId,title,region,language,averageRating,numVotes from title inner join titlerating on (title.titleId=titlerating.titleId)
where numVotes>100000 order by averageRating desc limit 50;
c. RC file.
CREATE TABLE top50moviesrc
STORED AS RCFILE
LOCATION '/home/cloudera/workspace/movies/hive/averageratingrc'
AS
select title.titleId,title,region,language,averageRating,numVotes from title inner join titlerating on (title.titleId=titlerating.titleId)
where numVotes>100000 order by averageRating desc limit 50;
d. Parquet.
CREATE TABLE top50moviesparquet
STORED AS PARQUET
LOCATION '/home/cloudera/workspace/movies/hive/averageratingparquet'
AS
select title.titleId,title,region,language,averageRating,numVotes from title inner join titlerating on (title.titleId=titlerating.titleId)
where numVotes>100000 order by averageRating desc limit 50;
e. ORC file.
Compression: SNAPPY
CREATE TABLE top50moviesrc
STORED AS ORCFILE
LOCATION '/home/cloudera/workspace/movies/hive/averageratingorc'
TBLPROPERTIES("orc.compress"="SNAPPY")
AS
select title.titleId,title,region,language,averageRating,numVotes from title inner join titlerating on (title.titleId=titlerating.titleId)
where numVotes>100000 order by averageRating desc limit 50;
f. Avro file.
CREATE TABLE top50moviesavro
STORED AS AVRO
LOCATION '/home/cloudera/workspace/movies/hive/averageratingavro'
AS
select title.titleId,title,region,language,averageRating,numVotes from title inner join titlerating on (title.titleId=titlerating.titleId)
where numVotes>100000 order by averageRating desc limit 50;
---------------------------------------------------------------------------
4. Find the top 50 voted movies
select title.titleId,title,region,language,averageRating,numVotes from title inner join titlerating on (title.titleId=titlerating.titleId)
order by numVotes desc limit 50;
a. Text file
Columns to be seperated with tab "\t"
create table top50ratedmovietext
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
LOCATION '/home/cloudera/workspace/movies/hive/topratedmovietext'
AS
select title.titleId,title,region,language,averageRating,numVotes from title inner join titlerating on (title.titleId=titlerating.titleId)
order by numVotes desc limit 50;
b. Sequence file.
create table top50ratedmovieseq
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS SEQUENCEFILE
LOCATION '/home/cloudera/workspace/movies/hive/top50ratedmovieseq'
AS
select title.titleId,title,region,language,averageRating,numVotes from title inner join titlerating on (title.titleId=titlerating.titleId)
order by numVotes desc limit 50;
c. RC file.
create table top50ratedmovierc
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS RCEFILE
LOCATION '/home/cloudera/workspace/movies/hive/top50ratedmovierc'
AS
select title.titleId,title,region,language,averageRating,numVotes from title inner join titlerating on (title.titleId=titlerating.titleId)
order by numVotes desc limit 50;
d. Parquet.
create table top50ratedmovieparquet
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS PARQUET
LOCATION '/home/cloudera/workspace/movies/hive/top50ratedmovieparquet'
AS
select title.titleId,title,region,language,averageRating,numVotes from title inner join titlerating on (title.titleId=titlerating.titleId)
order by numVotes desc limit 50;
e. ORC file.
Compression: SNAPPY
create table top50ratedmovieorc
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS ORC
LOCATION '/home/cloudera/workspace/movies/hive/top50ratedmovieorc'
TBLPROPERTIES ("orc.compress"="SNAPPY")
AS
select title.titleId,title,region,language,averageRating,numVotes from title inner join titlerating on (title.titleId=titlerating.titleId)
order by numVotes desc limit 50;
f. Avro file.
create table top50ratedmovieavro
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS AVRO
LOCATION '/home/cloudera/workspace/movies/hive/top50ratedmovieavro'
AS
select title.titleId,title,region,language,averageRating,numVotes from title inner join titlerating on (title.titleId=titlerating.titleId)
order by numVotes desc limit 50;
awesome post presented by you..your writing style is fabulous and keep update with your blogs Big data hadoop online Course Hyderabad
ReplyDeletesmm panel
ReplyDeleteSMM PANEL
İs ilanlari
instagram takipçi satın al
hırdavatçı
HTTPS://WWW.BEYAZESYATEKNİKSERVİSİ.COM.TR/
Servis
TİKTOK HİLE