Analysis of Top rated Sci-fi movies on IMDB website using R-Language.
Introduction
The Internet Movie Database (IMDb) is a website that serves as an online database of world cinema. This website contains a large number of public data on films such as the title of the film, the year of release of the film, the genre of the film, the audience, the rating of critics, the duration of the film, the summary of the film, actors, directors and much more. Faced with the large amount of data available on this site, I thought that it would be interesting to analyze the movies data on the IMDb website on the top listed sci-fi movies.
Calling libraries
library(rvest)
## Warning: package 'rvest' was built under R version 3.6.1
## Loading required package: xml2
library(esquisse)
## Registered S3 methods overwritten by 'ggplot2':
## method from
## [.quosures rlang
## c.quosures rlang
## print.quosures rlang
library(ggplot2)
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.6.1
## corrplot 0.84 loaded
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(plotly)
## Warning: package 'plotly' was built under R version 3.6.1
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
Counter and placeholders
Created placeholders as empty vectors to use these later in the script to append the values.
pg <-c(0,51,101,151,201,251,301,351,401,451,501,551,601,651,701)
title_list <- c()
rank_list <- c()
genre_list <- c()
runtime_list <- c()
imdb_rating_list <- c()
director_list <- c()
actor_list <- c()
poster_list <- c()
metascore_list <- c()
votes_list <- c()
gross_list <- c()
Main code in loop to crawl all pages for Top scifi movie list
The code used to scrape the information from IMDB’s web page
for(i in pg) {
url <- read_html(paste0("https://www.imdb.com/search/title/?title_type=feature&num_votes=25000,&genres=sci-fi&sort=user_rating,desc&start=","i&ref_=adv_nxt"))
#title
title <- url %>%
html_nodes(".mode-advanced") %>%
html_node(".lister-item-header a") %>%
html_text()
# Append them to the list
title_list <- append(title_list, title)
#rank
rank <- url %>%
html_nodes(".mode-advanced") %>%
html_node(".text-primary") %>%
html_text()
# Append them to the list
rank_list <- append(rank_list, rank)
#genre
genre <- url %>%
html_nodes(".mode-advanced") %>%
html_node(".genre") %>%
html_text()
# Append them to the list
genre_list <- append(genre_list, genre)
#runtime_list
runtime <- url %>%
html_nodes(".mode-advanced") %>%
html_node(".runtime") %>%
html_text()
# Append them to the list
runtime_list <- append(runtime_list, runtime)
#imdb rating
imdb_rating <- url %>%
html_nodes(".mode-advanced") %>%
html_node(".ratings-imdb-rating strong") %>%
html_text()
# Append them to the list
imdb_rating_list <- append(imdb_rating_list, imdb_rating)
#director
director <- url %>%
html_nodes(".mode-advanced") %>%
html_node(".text-muted+ p a:nth-child(1)") %>%
html_text()
# Append them to the list
director_list <- append(director_list, director)
#actor
actor <- url %>%
html_nodes(".mode-advanced") %>%
html_node(".lister-item-content .ghost~ a") %>%
html_text()
# Append them to the list
actor_list <- append(actor_list, actor)
#poster
poster <- url %>%
html_nodes(".mode-advanced") %>%
html_node(".loadlate") %>%
html_attrs()
# Append them to the list
poster_list <- append(poster_list, poster)
#metascore
metascore <- url %>%
html_nodes(".mode-advanced") %>%
html_node(".metascore") %>%
html_text()
# Append them to the list
metascore_list <- append(metascore_list, metascore)
#votes
votes <- url %>%
html_nodes(".mode-advanced") %>%
html_node(".sort-num_votes-visible span:nth-child(2)") %>%
html_text()
# Append them to the list
votes_list <- append(votes_list, votes)
#gross
gross <- url %>%
html_nodes(".mode-advanced") %>%
html_node(".ghost~ .text-muted+ span") %>%
html_text()
# Append them to the list
gross_list <- append(gross_list, gross)
}
Removing unwanted texts or symbols
runtime_list <- gsub(" min","",runtime_list)
genre_list <- gsub("\n","",genre_list)
genre_list <- gsub(",.*","",genre_list)
gross_list <- gsub("M","",gross_list)
gross_list <-substring(gross_list,2,6)
votes_list <- gsub(",","",votes_list)
Converting into numbers and factors
rank_list <- as.numeric(rank_list)
imdb_rating_list <- as.numeric(imdb_rating_list)
metascore_list <- as.numeric(metascore_list)
runtime_list <- as.numeric(runtime_list)
votes_list <- as.numeric(votes_list)
gross_list <- as.numeric(gross_list)
#convert as factor
# director_list <- as.factor(director_list)
# actor_list <- as.factor(actor_list)
Create a dataframe combining all above lists
imdb_toplist_scifi_df <- data.frame(Rank=rank_list, Title = title_list,
Imdb_rating=imdb_rating_list,Metascore=metascore_list,
Runtime=runtime_list, Genre=genre_list,
Gross=gross_list, Votes=votes_list,
Director=director_list, Starcast=actor_list)
head(imdb_toplist_scifi_df)
## Rank Title Imdb_rating
## 1 1 Inception 8.8
## 2 2 The Matrix 8.7
## 3 3 Star Wars: Episode V - The Empire Strikes Back 8.7
## 4 4 Interstellar 8.6
## 5 5 Star Wars: Episode IV - A New Hope 8.6
## 6 6 Avengers: Endgame 8.5
## Metascore Runtime Genre Gross Votes Director
## 1 74 148 Action 292.5 1887724 Christopher Nolan
## 2 73 136 Action 171.4 1549862 Lana Wachowski
## 3 82 124 Action 290.4 1075892 Irvin Kershner
## 4 74 169 Adventure 188.0 1343228 Christopher Nolan
## 5 90 121 Action 322.7 1144462 George Lucas
## 6 78 181 Action 858.3 591797 Anthony Russo
## Starcast
## 1 Leonardo DiCaprio
## 2 Keanu Reeves
## 3 Mark Hamill
## 4 Matthew McConaughey
## 5 Mark Hamill
## 6 Robert Downey Jr.
Corelation between rank, imdb rating , gross and votes
mydata <- imdb_toplist_scifi_df[, c(1,3,7,8)]
#remove NAs
data <- na.omit(mydata)
corr <- cor(data)
corrplot(corr,type="lower",addCoef.col = "white")

Distribution of Votes for the top 702 sci-fi movies
The distribution of votes seems left skewed, which shows that people do not vote that much.
qplot(data = imdb_toplist_scifi_df, Votes, fill=Genre, bins = 30)

histogram of runtime
Looks uniformly distributed
qplot(data = imdb_toplist_scifi_df, Runtime, fill=Genre, bins = 30)

Bubble plot showcasing which Genre has the highest avg gross earning
Movies under genre Action,Adventure , Animation and Drama are the leaders in this section
ggplot(imdb_toplist_scifi_df, aes(x=Runtime, y=Gross, size = Imdb_rating, color= Genre)) +
geom_point(alpha=0.5) +
scale_size(range = c(2,9))
## Warning: Removed 75 rows containing missing values (geom_point).

Creating Category for classifying movies
imdb_toplist_scifi_df$Category <- ifelse((imdb_toplist_scifi_df$Imdb_rating < 7),"Ok-Movie",
ifelse((imdb_toplist_scifi_df$Imdb_rating >=7 & imdb_toplist_scifi_df$Imdb_rating <8),"Good-Movie",
ifelse((imdb_toplist_scifi_df$Imdb_rating>=8),"Great-Movie","NA")))
head(imdb_toplist_scifi_df)
## Rank Title Imdb_rating
## 1 1 Inception 8.8
## 2 2 The Matrix 8.7
## 3 3 Star Wars: Episode V - The Empire Strikes Back 8.7
## 4 4 Interstellar 8.6
## 5 5 Star Wars: Episode IV - A New Hope 8.6
## 6 6 Avengers: Endgame 8.5
## Metascore Runtime Genre Gross Votes Director
## 1 74 148 Action 292.5 1887724 Christopher Nolan
## 2 73 136 Action 171.4 1549862 Lana Wachowski
## 3 82 124 Action 290.4 1075892 Irvin Kershner
## 4 74 169 Adventure 188.0 1343228 Christopher Nolan
## 5 90 121 Action 322.7 1144462 George Lucas
## 6 78 181 Action 858.3 591797 Anthony Russo
## Starcast Category
## 1 Leonardo DiCaprio Great-Movie
## 2 Keanu Reeves Great-Movie
## 3 Mark Hamill Great-Movie
## 4 Matthew McConaughey Great-Movie
## 5 Mark Hamill Great-Movie
## 6 Robert Downey Jr. Great-Movie
Actors_df with summarised counts and avg rating and gross
Actors_df <- imdb_toplist_scifi_df %>%
group_by(Starcast,Category) %>%
summarise(Tot_movies = n(), Avg_Rating = mean(Imdb_rating), Avg_gross = mean(Gross))
Actors_df
## # A tibble: 42 x 5
## # Groups: Starcast [42]
## Starcast Category Tot_movies Avg_Rating Avg_gross
## <fct> <chr> <int> <dbl> <dbl>
## 1 Alisa Freyndlikh Great-Movie 15 8.2 0.23
## 2 Amy Adams Good-Movie 15 7.9 100.
## 3 Anna Paquin Great-Movie 15 8 NA
## 4 Arnold Schwarzenegger Great-Movie 30 8.25 122.
## 5 Atsuko Tanaka Great-Movie 15 8 0.52
## 6 Ben Burtt Great-Movie 15 8.4 224.
## 7 Brigitte Helm Great-Movie 15 8.3 1.24
## 8 Bruce Willis Great-Movie 15 8 57.1
## 9 Bryan Cranston Good-Movie 15 7.9 32.0
## 10 Cem Yilmaz Great-Movie 15 8 NA
## # ... with 32 more rows
Directors_df with summarised counts and avg rating and gross
Directors_df <- imdb_toplist_scifi_df %>%
group_by(Director,Category) %>%
summarise(Tot_movies = n(), Avg_Rating = mean(Imdb_rating), Avg_gross = mean(Gross))
Directors_df
## # A tibble: 40 x 5
## # Groups: Director [39]
## Director Category Tot_movies Avg_Rating Avg_gross
## <fct> <chr> <int> <dbl> <dbl>
## 1 Andrei Tarkovsky Great-Movie 30 8.15 NA
## 2 Andrew Stanton Great-Movie 15 8.4 224.
## 3 Anthony Russo Great-Movie 30 8.5 769.
## 4 Bob Persichetti Great-Movie 15 8.4 190.
## 5 Brad Bird Great-Movie 15 8 23.2
## 6 Bryan Singer Great-Movie 15 8 234.
## 7 Christopher Nolan Great-Movie 45 8.63 178.
## 8 Denis Villeneuve Good-Movie 15 7.9 100.
## 9 Denis Villeneuve Great-Movie 15 8 92.0
## 10 Franklin J. Schaffner Great-Movie 15 8 33.4
## # ... with 30 more rows
Plot Actors_df and Directors_Df to check who make great movies
# plot_ly(Director_df, x = ~Tot_movies, y = ~Director, type = 'bar', color='Category',orientation = 'h')
ggplot(Directors_df, aes(reorder(Director,Tot_movies), y=Tot_movies, fill=Category)) +
geom_bar(stat="identity")+ coord_flip()
