Analysis of Top rated Sci-fi movies on IMDB website using R-Language.

Introduction

The Internet Movie Database (IMDb) is a website that serves as an online database of world cinema. This website contains a large number of public data on films such as the title of the film, the year of release of the film, the genre of the film, the audience, the rating of critics, the duration of the film, the summary of the film, actors, directors and much more. Faced with the large amount of data available on this site, I thought that it would be interesting to analyze the movies data on the IMDb website on the top listed sci-fi movies.

Calling libraries

library(rvest)
## Warning: package 'rvest' was built under R version 3.6.1
## Loading required package: xml2
library(esquisse)
## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang
library(ggplot2)
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.6.1
## corrplot 0.84 loaded
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(plotly)
## Warning: package 'plotly' was built under R version 3.6.1
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

Counter and placeholders

Created placeholders as empty vectors to use these later in the script to append the values.

pg <-c(0,51,101,151,201,251,301,351,401,451,501,551,601,651,701)
title_list <- c()
rank_list <- c()
genre_list <- c()
runtime_list <- c()
imdb_rating_list <- c()
director_list <- c()
actor_list <- c()
poster_list <- c()
metascore_list <- c()
votes_list <- c()
gross_list <- c()

Main code in loop to crawl all pages for Top scifi movie list

The code used to scrape the information from IMDB’s web page

for(i in pg) {
  url <- read_html(paste0("https://www.imdb.com/search/title/?title_type=feature&num_votes=25000,&genres=sci-fi&sort=user_rating,desc&start=","i&ref_=adv_nxt"))
  
  #title
  title <- url %>%
    html_nodes(".mode-advanced") %>%
    html_node(".lister-item-header a") %>%
    html_text()
  # Append them to the list
  title_list <- append(title_list, title)
  
  #rank
  rank <- url %>%
    html_nodes(".mode-advanced") %>%
    html_node(".text-primary") %>%
    html_text()
  # Append them to the list
  rank_list <- append(rank_list, rank)
 
  #genre
  genre <- url %>%
    html_nodes(".mode-advanced") %>%
    html_node(".genre") %>%
    html_text()
  # Append them to the list
  genre_list <- append(genre_list, genre)
 
  #runtime_list
  runtime <- url %>%
    html_nodes(".mode-advanced") %>%
    html_node(".runtime") %>%
    html_text()
  # Append them to the list
  runtime_list <- append(runtime_list, runtime)
  
  #imdb rating
  imdb_rating <- url %>%
    html_nodes(".mode-advanced") %>%
    html_node(".ratings-imdb-rating strong") %>%
    html_text()
  # Append them to the list
  imdb_rating_list <- append(imdb_rating_list, imdb_rating)
  
  #director
  director <- url %>%
    html_nodes(".mode-advanced") %>%
    html_node(".text-muted+ p a:nth-child(1)") %>%
    html_text()
  # Append them to the list
  director_list <- append(director_list, director)
  
  #actor
  actor <- url %>%
    html_nodes(".mode-advanced") %>%
    html_node(".lister-item-content .ghost~ a") %>%
    html_text()
  # Append them to the list
  actor_list <- append(actor_list, actor)
  
  #poster
  poster <- url %>%
    html_nodes(".mode-advanced") %>%
    html_node(".loadlate") %>%
    html_attrs()
  # Append them to the list
  poster_list <- append(poster_list, poster)
  
  #metascore
  metascore <- url %>%
    html_nodes(".mode-advanced") %>%
    html_node(".metascore") %>%
    html_text()
  # Append them to the list
  metascore_list <- append(metascore_list, metascore)
  
  #votes
  votes <- url %>%
    html_nodes(".mode-advanced") %>%
    html_node(".sort-num_votes-visible span:nth-child(2)") %>%
    html_text()
  # Append them to the list
  votes_list <- append(votes_list, votes)
  
  #gross
  gross <- url %>%
    html_nodes(".mode-advanced") %>%
    html_node(".ghost~ .text-muted+ span") %>%
    html_text()
  # Append them to the list
  gross_list <- append(gross_list, gross)
}

Removing unwanted texts or symbols

runtime_list <- gsub(" min","",runtime_list)
genre_list <- gsub("\n","",genre_list)
genre_list <- gsub(",.*","",genre_list)
gross_list <- gsub("M","",gross_list)
gross_list <-substring(gross_list,2,6)
votes_list <- gsub(",","",votes_list)

Converting into numbers and factors

rank_list <- as.numeric(rank_list)
imdb_rating_list <- as.numeric(imdb_rating_list)
metascore_list <- as.numeric(metascore_list)
runtime_list <- as.numeric(runtime_list)
votes_list <- as.numeric(votes_list)
gross_list <- as.numeric(gross_list)

#convert as factor
# director_list <- as.factor(director_list)
# actor_list <- as.factor(actor_list)

Create a dataframe combining all above lists

imdb_toplist_scifi_df <- data.frame(Rank=rank_list, Title = title_list,
                                    Imdb_rating=imdb_rating_list,Metascore=metascore_list,
                                Runtime=runtime_list, Genre=genre_list,
                                Gross=gross_list, Votes=votes_list,
                                Director=director_list, Starcast=actor_list)

head(imdb_toplist_scifi_df)
##   Rank                                          Title Imdb_rating
## 1    1                                      Inception         8.8
## 2    2                                     The Matrix         8.7
## 3    3 Star Wars: Episode V - The Empire Strikes Back         8.7
## 4    4                                   Interstellar         8.6
## 5    5             Star Wars: Episode IV - A New Hope         8.6
## 6    6                              Avengers: Endgame         8.5
##   Metascore Runtime     Genre Gross   Votes          Director
## 1        74     148    Action 292.5 1887724 Christopher Nolan
## 2        73     136    Action 171.4 1549862    Lana Wachowski
## 3        82     124    Action 290.4 1075892    Irvin Kershner
## 4        74     169 Adventure 188.0 1343228 Christopher Nolan
## 5        90     121    Action 322.7 1144462      George Lucas
## 6        78     181    Action 858.3  591797     Anthony Russo
##              Starcast
## 1   Leonardo DiCaprio
## 2        Keanu Reeves
## 3         Mark Hamill
## 4 Matthew McConaughey
## 5         Mark Hamill
## 6   Robert Downey Jr.

Corelation between rank, imdb rating , gross and votes

mydata <- imdb_toplist_scifi_df[, c(1,3,7,8)]

#remove NAs
data <- na.omit(mydata)
corr <- cor(data)

corrplot(corr,type="lower",addCoef.col = "white")

Distribution of Votes for the top 702 sci-fi movies

The distribution of votes seems left skewed, which shows that people do not vote that much.

qplot(data = imdb_toplist_scifi_df, Votes, fill=Genre, bins = 30)

histogram of runtime

Looks uniformly distributed

qplot(data = imdb_toplist_scifi_df, Runtime, fill=Genre, bins = 30)

Bubble plot showcasing which Genre has the highest avg gross earning

Movies under genre Action,Adventure , Animation and Drama are the leaders in this section

ggplot(imdb_toplist_scifi_df, aes(x=Runtime, y=Gross, size = Imdb_rating, color= Genre)) +
    geom_point(alpha=0.5) +
    scale_size(range = c(2,9))
## Warning: Removed 75 rows containing missing values (geom_point).

Creating Category for classifying movies

imdb_toplist_scifi_df$Category <- ifelse((imdb_toplist_scifi_df$Imdb_rating < 7),"Ok-Movie",
                                         ifelse((imdb_toplist_scifi_df$Imdb_rating >=7 & imdb_toplist_scifi_df$Imdb_rating <8),"Good-Movie",
                                                ifelse((imdb_toplist_scifi_df$Imdb_rating>=8),"Great-Movie","NA")))
head(imdb_toplist_scifi_df)
##   Rank                                          Title Imdb_rating
## 1    1                                      Inception         8.8
## 2    2                                     The Matrix         8.7
## 3    3 Star Wars: Episode V - The Empire Strikes Back         8.7
## 4    4                                   Interstellar         8.6
## 5    5             Star Wars: Episode IV - A New Hope         8.6
## 6    6                              Avengers: Endgame         8.5
##   Metascore Runtime     Genre Gross   Votes          Director
## 1        74     148    Action 292.5 1887724 Christopher Nolan
## 2        73     136    Action 171.4 1549862    Lana Wachowski
## 3        82     124    Action 290.4 1075892    Irvin Kershner
## 4        74     169 Adventure 188.0 1343228 Christopher Nolan
## 5        90     121    Action 322.7 1144462      George Lucas
## 6        78     181    Action 858.3  591797     Anthony Russo
##              Starcast    Category
## 1   Leonardo DiCaprio Great-Movie
## 2        Keanu Reeves Great-Movie
## 3         Mark Hamill Great-Movie
## 4 Matthew McConaughey Great-Movie
## 5         Mark Hamill Great-Movie
## 6   Robert Downey Jr. Great-Movie

Actors_df with summarised counts and avg rating and gross

Actors_df <- imdb_toplist_scifi_df %>%
  group_by(Starcast,Category) %>%
  summarise(Tot_movies = n(), Avg_Rating = mean(Imdb_rating), Avg_gross = mean(Gross))

Actors_df
## # A tibble: 42 x 5
## # Groups:   Starcast [42]
##    Starcast              Category    Tot_movies Avg_Rating Avg_gross
##    <fct>                 <chr>            <int>      <dbl>     <dbl>
##  1 Alisa Freyndlikh      Great-Movie         15       8.2       0.23
##  2 Amy Adams             Good-Movie          15       7.9     100.  
##  3 Anna Paquin           Great-Movie         15       8        NA   
##  4 Arnold Schwarzenegger Great-Movie         30       8.25    122.  
##  5 Atsuko Tanaka         Great-Movie         15       8         0.52
##  6 Ben Burtt             Great-Movie         15       8.4     224.  
##  7 Brigitte Helm         Great-Movie         15       8.3       1.24
##  8 Bruce Willis          Great-Movie         15       8        57.1 
##  9 Bryan Cranston        Good-Movie          15       7.9      32.0 
## 10 Cem Yilmaz            Great-Movie         15       8        NA   
## # ... with 32 more rows

Directors_df with summarised counts and avg rating and gross

Directors_df <- imdb_toplist_scifi_df %>%
  group_by(Director,Category) %>%
  summarise(Tot_movies = n(), Avg_Rating = mean(Imdb_rating), Avg_gross = mean(Gross))

Directors_df
## # A tibble: 40 x 5
## # Groups:   Director [39]
##    Director              Category    Tot_movies Avg_Rating Avg_gross
##    <fct>                 <chr>            <int>      <dbl>     <dbl>
##  1 Andrei Tarkovsky      Great-Movie         30       8.15      NA  
##  2 Andrew Stanton        Great-Movie         15       8.4      224. 
##  3 Anthony Russo         Great-Movie         30       8.5      769. 
##  4 Bob Persichetti       Great-Movie         15       8.4      190. 
##  5 Brad Bird             Great-Movie         15       8         23.2
##  6 Bryan Singer          Great-Movie         15       8        234. 
##  7 Christopher Nolan     Great-Movie         45       8.63     178. 
##  8 Denis Villeneuve      Good-Movie          15       7.9      100. 
##  9 Denis Villeneuve      Great-Movie         15       8         92.0
## 10 Franklin J. Schaffner Great-Movie         15       8         33.4
## # ... with 30 more rows

Plot Actors_df and Directors_Df to check who make great movies

# plot_ly(Director_df, x = ~Tot_movies, y = ~Director, type = 'bar', color='Category',orientation = 'h')


ggplot(Directors_df, aes(reorder(Director,Tot_movies), y=Tot_movies, fill=Category)) +
  geom_bar(stat="identity")+ coord_flip()