SlideShare a Scribd company logo
1
Connect With Us
• Website (https://www.rsquaredacademy.com/)
• Free Online R Courses (https://rsquared-academy.thinkific.com/)
• R Packages (https://pkgs.rsquaredacademy.com)
• Shiny Apps (https://apps.rsquaredacademy.com)
• Blog (https://blog.rsquaredacademy.com)
• GitHub (https://github.com/rsquaredacademy)
• YouTube (https://www.youtube.com/user/rsquaredin/)
• Twitter (https://twitter.com/rsquaredacademy)
• Facebook (https://www.facebook.com/rsquaredacademy/)
• LinkedIn (https://in.linkedin.com/company/rsquared-academy)
2
• what?
• why?
• how?
• use cases
• HTML basics
• case studies
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
Libraries
library(robotstxt)
library(rvest)
library(selectr)
library(xml2)
library(dplyr)
library(stringr)
library(forcats)
library(magrittr)
library(tidyr)
library(ggplot2)
library(lubridate)
library(tibble)
library(purrr)
20
21
robotstxt
paths_allowed(
paths = c("https://www.imdb.com/search/title?groups=top_250&sort=user_
)
##
www.imdb.com No encoding supplied: defaulting to U
## [1] TRUE
22
Read Web Page
imdb <- read_html("https://www.imdb.com/search/title?groups=top_250&sort
imdb
## {xml_document}
## <html xmlns:og="http://ogp.me/ns#" xmlns:fb="http://www.facebook.com/
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; chars
## [2] <body id="styleguide-v2" class="fixed">\n\n <img heigh
23
24
Title
# Movie titles: the text of every link found inside the <h3> heading of a
# `.lister-item-content` entry on the parsed IMDb page.
movie_title <- html_text(html_nodes(imdb, ".lister-item-content h3 a"))
movie_title
## [1] "The Shawshank Redemption"
## [2] "The Godfather"
## [3] "The Dark Knight"
## [4] "The Godfather: Part II"
## [5] "The Lord of the Rings: The Return of the King"
## [6] "Pulp Fiction"
## [7] "Schindler's List"
## [8] "Il buono, il brutto, il cattivo"
## [9] "12 Angry Men"
## [10] "Inception"
## [11] "Fight Club"
## [12] "The Lord of the Rings: The Fellowship of the Ring"
## [13] "Forrest Gump"
## [14] "The Lord of the Rings: The Two Towers"
## [15] "The Matrix"
## [16] "Goodfellas"
## [17] "Star Wars: Episode V - The Empire Strikes Back"
25
26
Year of Release
# Release year of each listed movie, as a number.
#
# The year is taken as the first run of four digits in the year label instead
# of relying on fixed character positions, so any extra text around the year
# does not break the extraction. The digits are converted directly to a
# number: going through as.Date(format = "%Y") fills the missing month and
# day from the current date, which makes the result depend on the day the
# script runs (an invalid combination such as 29 February yields NA).
movie_year <- imdb %>%
  html_nodes(".lister-item-content h3 .lister-item-year") %>%
  html_text() %>%
  str_extract("[0-9]{4}") %>%
  as.numeric()
movie_year
## [1] 1994 1972 2008 1974 2003 1994 1993 1966 1957 2010 1999 2001 1994
## [15] 1999 1990 1980 1975 1954 2014 2002 2001 1998 1999 1997 1995 1995
## [29] 1991 1977 1946 2018 2016 2018 2018 2014 2011 2006 2006 2002 2000
## [43] 1998 1994 1991 1988 1988 1985 1981 1979
27
28
Certificate
# Certificate label of each listed movie, read from the `.certificate`
# element inside a paragraph of the list entry.
# NOTE(review): the recorded output has 44 values for 50 titles, so entries
# without a certificate are simply absent and this vector is not aligned
# with the other per-movie vectors -- confirm before combining them.
movie_certificate <- html_text(
  html_nodes(imdb, ".lister-item-content p .certificate")
)
movie_certificate
## [1] "A" "A" "UA" "PG-13" "A" "A" "UA" "A"
## [9] "PG-13" "PG-13" "PG-13" "A" "A" "PG" "UA" "R"
## [17] "PG" "A" "A" "PG-13" "A" "R" "A" "A"
## [25] "U" "PG" "UA" "U" "U" "UA" "A" "UA"
## [33] "PG-13" "A" "R" "R" "R" "A" "U" "U"
## [41] "R" "U" "PG" "R"
29
30
Runtime
# Runtime of each listed movie as a number: the runtime text is split on
# spaces and only the first piece is kept and converted.
# (The summary table later in the file labels this column as minutes.)
movie_runtime <- imdb %>%
  html_nodes(css = ".lister-item-content p .runtime") %>%
  html_text() %>%
  str_split(pattern = " ") %>%
  map_chr(1) %>%
  as.numeric()
movie_runtime
## [1] 142 175 152 202 201 154 195 161 96 148 139 178 142 179 136 146
## [18] 133 207 169 130 125 169 189 116 106 127 110 118 121 130 139 161
## [35] 149 106 112 130 151 150 113 155 119 88 137 155 89 116 115 147
31
32
Genre
# Genre text of each listed movie, with surrounding whitespace removed.
# A movie with several genres keeps them in one comma-separated string.
movie_genre <- str_trim(
  html_text(html_nodes(imdb, ".lister-item-content p .genre"))
)
movie_genre
## [1] "Drama" "Crime, Drama"
## [3] "Action, Crime, Drama" "Crime, Drama"
## [5] "Adventure, Drama, Fantasy" "Crime, Drama"
## [7] "Biography, Drama, History" "Western"
## [9] "Drama" "Action, Adventure, Sci-Fi"
## [11] "Drama" "Adventure, Drama, Fantasy"
## [13] "Drama, Romance" "Adventure, Drama, Fantasy"
## [15] "Action, Sci-Fi" "Biography, Crime, Drama"
## [17] "Action, Adventure, Fantasy" "Drama"
## [19] "Adventure, Drama" "Adventure, Drama, Sci-Fi"
## [21] "Crime, Drama" "Animation, Adventure, Family"
## [23] "Drama, War" "Crime, Drama, Fantasy"
## [25] "Comedy, Drama, Romance" "Crime, Mystery, Thriller"
## [27] "Crime, Drama, Mystery" "Action, Crime, Drama"
## [29] "Crime, Drama, Thriller" "Action, Adventure, Fantasy"
## [31] "Drama, Family, Fantasy" "Crime, Thriller"
33
34
Rating
# Rating of each listed movie, read from the `data-value` attribute of the
# rating element (not from its displayed text) and converted to a number.
movie_rating <- imdb %>%
  html_nodes(css = ".ratings-bar .ratings-imdb-rating") %>%
  html_attr(name = "data-value") %>%
  as.numeric()
movie_rating
## [1] 9.3 9.2 9.0 9.0 8.9 8.9 8.9 8.9 8.9 8.8 8.8 8.8 8.8 8.7 8.7 8.7
## [18] 8.7 8.7 8.6 8.6 8.6 8.6 8.6 8.6 8.6 8.6 8.6 8.6 8.6 8.6 8.5 8.5
## [35] 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5
35
36
37
Votes
# Number of votes per movie: the `content` attribute of every
# <meta itemprop="ratingCount"> element in the document, as a number.
movie_votes <- imdb %>%
  html_nodes(xpath = '//meta[@itemprop="ratingCount"]') %>%
  html_attr(name = "content") %>%
  as.numeric()
movie_votes
## [1] 2072893 1422292 2038787 987020 1475650 1621033 1074273 615219
## [9] 585562 1817393 1658750 1492209 1589127 1334563 1489071 895033
## [17] 1040130 822277 280024 1276946 637716 549410 1096231 1000909
## [25] 545280 897576 1271530 913352 1118817 1109777 352837 39132
## [33] 118413 174125 617621 605417 666327 1052901 1064050 633675
## [41] 1021511 1198326 941917 823238 897607 198398 192715 923178
## [49] 803033 542311
38
39
Revenue
# Revenue of each listed movie as a number (the summary table later in the
# file labels this column "$ millions").
movie_revenue <- imdb %>%
  html_nodes(xpath = '//span[@name="nv"]') %>%
  html_text() %>%
  # Keep only the values that begin with a literal dollar sign. The `$` must
  # be escaped: unescaped it is the end-of-string anchor, so "^$.*" matches
  # nothing but empty strings and every revenue value is dropped.
  str_extract(pattern = "^\\$.*") %>%
  na.omit() %>%
  as.character() %>%
  # NOTE(review): the two insert positions are hard-coded for the page as it
  # was when scraped (presumably the entries with no revenue figure); they
  # must be re-checked whenever the list changes.
  append(values = NA, after = 30) %>%
  append(values = NA, after = 46) %>%
  # Drop the first and last character (presumably the "$" prefix and a unit
  # suffix such as "M" -- confirm against the page) before converting.
  str_sub(start = 2, end = -2) %>%
  as.numeric()
movie_revenue
## [1] 28.34 134.97 534.86 57.30 377.85 107.93 96.07 6.10 4.36 2
## [11] 37.03 315.54 330.25 342.55 171.48 46.84 290.48 112.00 0.27 1
## [21] 7.56 10.06 216.54 136.80 57.60 23.34 100.13 19.50 130.74 3
## [31] NA 1.19 12.39 190.24 678.82 13.09 13.18 53.09 132.38
## [41] 25.54 187.71 6.72 312.90 204.84 11.99 NA 210.61 248.16
40
Putting it all together…
top_50 <- tibble(title = movie_title, release = movie_year,
`runtime (mins)` = movie_runtime, genre = movie_genre, rating = movi
votes = movie_votes, `revenue ($ millions)` = movie_revenue)
top_50
## # A tibble: 50 x 7
## title release `runtime (mins)` genre rating votes `revenue (
## <chr> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 The Sha~ 1994 142 Drama 9.3 2.07e6
## 2 The God~ 1972 175 Crime,~ 9.2 1.42e6
## 3 The Dar~ 2008 152 Action~ 9 2.04e6
## 4 The God~ 1974 202 Crime,~ 9 9.87e5
## 5 The Lor~ 2003 201 Advent~ 8.9 1.48e6
## 6 Pulp Fi~ 1994 154 Crime,~ 8.9 1.62e6
## 7 Schindl~ 1993 195 Biogra~ 8.9 1.07e6
## 8 Il buon~ 1966 161 Western 8.9 6.15e5
## 9 12 Angr~ 1957 96 Drama 8.9 5.86e5
## 10 Incepti~ 2010 148 Action~ 8.8 1.82e6
## # ... with 40 more rows
41
42
robotstxt
paths_allowed(
paths = c("https://en.wikipedia.org/wiki/List_of_Governors_of_Reserve_
)
##
en.wikipedia.org
## [1] TRUE
43
Read Web Page
rbi_guv <- read_html("https://en.wikipedia.org/wiki/List_of_Governors_of
rbi_guv
## {xml_document}
## <html class="client-nojs" lang="en" dir="ltr">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; chars
## [2] <body class="mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-
44
List of Governors
# Parse every <table> on the governors page into a data frame and keep the
# second one, which holds the list of governors.
profile <- rbi_guv %>%
  html_nodes(css = "table") %>%
  html_table() %>%
  extract2(2)
profile
## No. Officeholder Portrait Term start Term
## 1 1 Osborne Smith NA 1 April 1935 30 June 1
## 2 2 James Braid Taylor NA 1 July 1937 17 February 1
## 3 3 C. D. Deshmukh NA 11 August 1943ii 30 May 1
## 4 4 Benegal Rama Rau NA 1 July 1949 14 January 1
## 5 5 K. G. Ambegaonkar NA 14 January 1957 28 February 1
## 6 6 H. V. R. Iyengar NA 1 March 1957 28 February 1
## 7 7 P. C. Bhattacharya NA 1 March 1962 30 June 1
## 8 8 Lakshmi Kant Jha NA 1 July 1967 3 May 1
## 9 9 B. N. Adarkar NA 4 May 1970 15 June 1
## 10 10 Sarukkai Jagannathan NA 16 June 1970 19 May 1
## 11 11 N. C. Sen Gupta NA 19 May 1975 19 August 1
## 12 12 K. R. Puri NA 20 August 1975 2 May 1
## 13 13 M. Narasimham NA 3 May 1977 30 November 1
## 14 14 I. G. Patel NA 1 December 1977 15 September 1
## 15 15 Manmohan Singh NA 16 September 1982 14 January 1
45
Sort
# Rank governors by length of tenure: split `Term in office` into a `term`
# piece and a `days` piece (default separator), keep the name and the term,
# and sort with the numerically largest term first.
arrange(
  select(
    separate(profile, `Term in office`, into = c("term", "days")),
    Officeholder, term
  ),
  desc(as.numeric(term))
)
## Officeholder term
## 1 Benegal Rama Rau 2754
## 2 C. D. Deshmukh 2150
## 3 R. N. Malhotra 2147
## 4 Bimal Jalan 2114
## 5 James Braid Taylor 2057
## 6 P. C. Bhattacharya 1947
## 7 Y. Venugopal Reddy 1826
## 8 H. V. R. Iyengar 1825
## 9 D. Subbarao 1825
## 10 Sarukkai Jagannathan 1798
## 11 C. Rangarajan 1795
## 12 I. G. Patel 1749
## 13 Raghuram Rajan 1096
## 14 Lakshmi Kant Jha 1037
## 15 Urjit Patel 947
## 16 Manmohan Singh 851
46
Backgrounds
# Number of governors for each distinct value of `Background`.
count(profile, Background)
## # A tibble: 9 x 2
## Background n
## <chr> <int>
## 1 "" 1
## 2 Banker 2
## 3 Career Reserve Bank of India officer 1
## 4 Economist 7
## 5 IAS officer 4
## 6 ICS officer 7
## 7 Indian Administrative Service (IAS) officer 1
## 8 Indian Audit and Accounts Service officer 1
## 9 Indian Civil Service (ICS) officer 1
47
Backgrounds
# Group the raw `Background` values into broader categories and count them.
# The civil-service variants are merged into "Bureaucrats", the empty string
# becomes "No Info" and the career RBI entry becomes "RBI Officer"; values
# that are not listed keep their own level. The count columns `f` and `n`
# are renamed to `background` and `count`.
backgrounds <- rename(
  fct_count(
    fct_collapse(
      pull(profile, Background),
      Bureaucrats = c(
        "IAS officer",
        "ICS officer",
        "Indian Administrative Service (IAS) officer",
        "Indian Audit and Accounts Service officer",
        "Indian Civil Service (ICS) officer"
      ),
      `No Info` = c(""),
      `RBI Officer` = c("Career Reserve Bank of India officer")
    )
  ),
  background = f,
  count = n
)
48
Backgrounds
backgrounds
## # A tibble: 5 x 2
## background count
## <fct> <int>
## 1 No Info 1
## 2 Banker 2
## 3 RBI Officer 1
## 4 Economist 7
## 5 Bureaucrats 14
49
Backgrounds
# Bar chart of the number of governors per background category.
ggplot(backgrounds) +
  geom_col(mapping = aes(x = background, y = count), fill = "blue") +
  labs(
    x = "Background",
    y = "Count",
    title = "Background of RBI Governors"
  )
50
51
Summary
• web scraping is the extraction of data from web sites
• best suited to static, well-structured HTML pages
• review the site's robots.txt file before scraping
• HTML code can change at any time
• if an API is available, please use it
• do not overwhelm websites with requests
52
53

More Related Content

What's hot (20)

PPTX
機械学習応用システムのための要求工学
Nobukazu Yoshioka
 
PDF
resume-theorique-m104-2203-6246fb9f3e558 (2).pdf
FootballLovers9
 
PPT
A mezők élővilága - 6. osztály (természetismeret)
Csimax
 
PDF
コンピュテーショナルフォトグラフティの基礎
Norishige Fukushima
 
PPTX
Python 学習教材 (~299ページ)
Jun MITANI
 
PPTX
კომპიუტერის მოწყობილობები
Mari Morchiladze
 
PDF
論文紹介:End-to-End Object Detection with Transformers
Toru Tamaki
 
PDF
SPADE :Semantic Image Synthesis with Spatially-Adaptive Normalization
Tenki Lee
 
PDF
Prefectに関して imperfectに語る
notrogue
 
PPTX
Map
kikairoya
 
PPT
Certifier son Centre de Services NF345 "Centre de relation client"
itSMF France
 
PDF
[DL輪読会]RobustNet: Improving Domain Generalization in Urban- Scene Segmentatio...
Deep Learning JP
 
PDF
CS立体図を用いた地形判読(FOSS4G 2017 Tokyoハンズオン)
OSgeo Japan
 
PDF
CS立体図を自作して公開してみた
Kouichi Kita
 
PDF
三次元表現まとめ(深層学習を中心に)
Tomohiro Motoda
 
PDF
Action Recognitionの歴史と最新動向
Ohnishi Katsunori
 
PPTX
機械学習を用いたfNIRSの解析手法の提案
Reiji Ohkuma
 
PDF
リクルートにおける画像解析事例紹介
Recruit Technologies
 
PPTX
【DL輪読会】HexPlaneとK-Planes
Deep Learning JP
 
PPT
introduction des composants d' ordinateur 7eme annee secondaire
minabintmina
 
機械学習応用システムのための要求工学
Nobukazu Yoshioka
 
resume-theorique-m104-2203-6246fb9f3e558 (2).pdf
FootballLovers9
 
A mezők élővilága - 6. osztály (természetismeret)
Csimax
 
コンピュテーショナルフォトグラフティの基礎
Norishige Fukushima
 
Python 学習教材 (~299ページ)
Jun MITANI
 
კომპიუტერის მოწყობილობები
Mari Morchiladze
 
論文紹介:End-to-End Object Detection with Transformers
Toru Tamaki
 
SPADE :Semantic Image Synthesis with Spatially-Adaptive Normalization
Tenki Lee
 
Prefectに関して imperfectに語る
notrogue
 
Certifier son Centre de Services NF345 "Centre de relation client"
itSMF France
 
[DL輪読会]RobustNet: Improving Domain Generalization in Urban- Scene Segmentatio...
Deep Learning JP
 
CS立体図を用いた地形判読(FOSS4G 2017 Tokyoハンズオン)
OSgeo Japan
 
CS立体図を自作して公開してみた
Kouichi Kita
 
三次元表現まとめ(深層学習を中心に)
Tomohiro Motoda
 
Action Recognitionの歴史と最新動向
Ohnishi Katsunori
 
機械学習を用いたfNIRSの解析手法の提案
Reiji Ohkuma
 
リクルートにおける画像解析事例紹介
Recruit Technologies
 
【DL輪読会】HexPlaneとK-Planes
Deep Learning JP
 
introduction des composants d' ordinateur 7eme annee secondaire
minabintmina
 

Similar to Practical Introduction to Web scraping using R (20)

PDF
Hacking data visualisations
Melinda Seckington
 
PDF
Data Manipulation Using R (& dplyr)
Ram Narasimhan
 
PPTX
R - Get Started I - Sanaitics
Vijith Nair
 
PDF
Regression Model for movies
Mohit Rajput
 
PPTX
R data interfaces
Bhavesh Sarvaiya
 
PDF
R markup code to create Regression Model
Mohit Rajput
 
PDF
Data analystics with R module 3 cseds vtu
LalithauLali
 
PPTX
R for hadoopers
Gwen (Chen) Shapira
 
PDF
Introduction to Data Mining with R and Data Import/Export in R
Yanchang Zhao
 
KEY
R for Pirates. ESCCONF October 27, 2011
Mandi Walls
 
PDF
Introduction to r studio on aws 2020 05_06
Barry DeCicco
 
PDF
Using R For Data Management Statistical Analysis And Graphics 1st Edition Nic...
simpikimal
 
PPTX
Getting started with R when analysing GitHub commits
Barbara Fusinska
 
PDF
Machine Learning in R
Alexandros Karatzoglou
 
PDF
Data Science - The Most Profitable Movie Characteristic
Cheah Eng Soon
 
PDF
Building a Movie Success Predictor
Youness Lahdili
 
PDF
R basics
Sagun Baijal
 
PDF
Import web resources using R Studio
Rupak Roy
 
PDF
R tutorial
Richard Vidgen
 
PDF
R programming & Machine Learning
AmanBhalla14
 
Hacking data visualisations
Melinda Seckington
 
Data Manipulation Using R (& dplyr)
Ram Narasimhan
 
R - Get Started I - Sanaitics
Vijith Nair
 
Regression Model for movies
Mohit Rajput
 
R data interfaces
Bhavesh Sarvaiya
 
R markup code to create Regression Model
Mohit Rajput
 
Data analystics with R module 3 cseds vtu
LalithauLali
 
R for hadoopers
Gwen (Chen) Shapira
 
Introduction to Data Mining with R and Data Import/Export in R
Yanchang Zhao
 
R for Pirates. ESCCONF October 27, 2011
Mandi Walls
 
Introduction to r studio on aws 2020 05_06
Barry DeCicco
 
Using R For Data Management Statistical Analysis And Graphics 1st Edition Nic...
simpikimal
 
Getting started with R when analysing GitHub commits
Barbara Fusinska
 
Machine Learning in R
Alexandros Karatzoglou
 
Data Science - The Most Profitable Movie Characteristic
Cheah Eng Soon
 
Building a Movie Success Predictor
Youness Lahdili
 
R basics
Sagun Baijal
 
Import web resources using R Studio
Rupak Roy
 
R tutorial
Richard Vidgen
 
R programming & Machine Learning
AmanBhalla14
 
Ad

More from Rsquared Academy (20)

PDF
Handling Date & Time in R
Rsquared Academy
 
PDF
Market Basket Analysis in R
Rsquared Academy
 
PDF
Joining Data with dplyr
Rsquared Academy
 
PDF
Explore Data using dplyr
Rsquared Academy
 
PDF
Data Wrangling with dplyr
Rsquared Academy
 
PDF
Writing Readable Code with Pipes
Rsquared Academy
 
PDF
Introduction to tibbles
Rsquared Academy
 
PDF
Read data from Excel spreadsheets into R
Rsquared Academy
 
PDF
Read/Import data from flat/delimited files into R
Rsquared Academy
 
PDF
Variables & Data Types in R
Rsquared Academy
 
PDF
How to install & update R packages?
Rsquared Academy
 
PDF
How to get help in R?
Rsquared Academy
 
PDF
Introduction to R
Rsquared Academy
 
PDF
RMySQL Tutorial For Beginners
Rsquared Academy
 
PDF
R Markdown Tutorial For Beginners
Rsquared Academy
 
PDF
R Data Visualization Tutorial: Bar Plots
Rsquared Academy
 
PDF
R Programming: Introduction to Matrices
Rsquared Academy
 
PDF
R Programming: Introduction to Vectors
Rsquared Academy
 
PPTX
R Programming: Variables & Data Types
Rsquared Academy
 
PDF
Data Visualization With R: Learn To Combine Multiple Graphs
Rsquared Academy
 
Handling Date & Time in R
Rsquared Academy
 
Market Basket Analysis in R
Rsquared Academy
 
Joining Data with dplyr
Rsquared Academy
 
Explore Data using dplyr
Rsquared Academy
 
Data Wrangling with dplyr
Rsquared Academy
 
Writing Readable Code with Pipes
Rsquared Academy
 
Introduction to tibbles
Rsquared Academy
 
Read data from Excel spreadsheets into R
Rsquared Academy
 
Read/Import data from flat/delimited files into R
Rsquared Academy
 
Variables & Data Types in R
Rsquared Academy
 
How to install & update R packages?
Rsquared Academy
 
How to get help in R?
Rsquared Academy
 
Introduction to R
Rsquared Academy
 
RMySQL Tutorial For Beginners
Rsquared Academy
 
R Markdown Tutorial For Beginners
Rsquared Academy
 
R Data Visualization Tutorial: Bar Plots
Rsquared Academy
 
R Programming: Introduction to Matrices
Rsquared Academy
 
R Programming: Introduction to Vectors
Rsquared Academy
 
R Programming: Variables & Data Types
Rsquared Academy
 
Data Visualization With R: Learn To Combine Multiple Graphs
Rsquared Academy
 
Ad

Recently uploaded (20)

PDF
Product Management in HealthTech (Case Studies from SnappDoctor)
Hamed Shams
 
PPTX
BinarySearchTree in datastructures in detail
kichokuttu
 
PPTX
apidays Munich 2025 - Building an AWS Serverless Application with Terraform, ...
apidays
 
PDF
Driving Employee Engagement in a Hybrid World.pdf
Mia scott
 
PDF
apidays Singapore 2025 - The API Playbook for AI by Shin Wee Chuang (PAND AI)
apidays
 
PPTX
apidays Singapore 2025 - From Data to Insights: Building AI-Powered Data APIs...
apidays
 
PPTX
apidays Singapore 2025 - Generative AI Landscape Building a Modern Data Strat...
apidays
 
PDF
Simplifying Document Processing with Docling for AI Applications.pdf
Tamanna36
 
PDF
A GraphRAG approach for Energy Efficiency Q&A
Marco Brambilla
 
PDF
Using AI/ML for Space Biology Research
VICTOR MAESTRE RAMIREZ
 
PDF
Research Methodology Overview Introduction
ayeshagul29594
 
PPTX
apidays Helsinki & North 2025 - Agentic AI: A Friend or Foe?, Merja Kajava (A...
apidays
 
PDF
NIS2 Compliance for MSPs: Roadmap, Benefits & Cybersecurity Trends (2025 Guide)
GRC Kompas
 
PDF
apidays Singapore 2025 - Building a Federated Future, Alex Szomora (GSMA)
apidays
 
PDF
Development and validation of the Japanese version of the Organizational Matt...
Yoga Tokuyoshi
 
PDF
The European Business Wallet: Why It Matters and How It Powers the EUDI Ecosy...
Lal Chandran
 
PPTX
Aict presentation on dpplppp sjdhfh.pptx
vabaso5932
 
PDF
OPPOTUS - Malaysias on Malaysia 1Q2025.pdf
Oppotus
 
PPTX
apidays Helsinki & North 2025 - APIs at Scale: Designing for Alignment, Trust...
apidays
 
PDF
Optimizing Large Language Models with vLLM and Related Tools.pdf
Tamanna36
 
Product Management in HealthTech (Case Studies from SnappDoctor)
Hamed Shams
 
BinarySearchTree in datastructures in detail
kichokuttu
 
apidays Munich 2025 - Building an AWS Serverless Application with Terraform, ...
apidays
 
Driving Employee Engagement in a Hybrid World.pdf
Mia scott
 
apidays Singapore 2025 - The API Playbook for AI by Shin Wee Chuang (PAND AI)
apidays
 
apidays Singapore 2025 - From Data to Insights: Building AI-Powered Data APIs...
apidays
 
apidays Singapore 2025 - Generative AI Landscape Building a Modern Data Strat...
apidays
 
Simplifying Document Processing with Docling for AI Applications.pdf
Tamanna36
 
A GraphRAG approach for Energy Efficiency Q&A
Marco Brambilla
 
Using AI/ML for Space Biology Research
VICTOR MAESTRE RAMIREZ
 
Research Methodology Overview Introduction
ayeshagul29594
 
apidays Helsinki & North 2025 - Agentic AI: A Friend or Foe?, Merja Kajava (A...
apidays
 
NIS2 Compliance for MSPs: Roadmap, Benefits & Cybersecurity Trends (2025 Guide)
GRC Kompas
 
apidays Singapore 2025 - Building a Federated Future, Alex Szomora (GSMA)
apidays
 
Development and validation of the Japanese version of the Organizational Matt...
Yoga Tokuyoshi
 
The European Business Wallet: Why It Matters and How It Powers the EUDI Ecosy...
Lal Chandran
 
Aict presentation on dpplppp sjdhfh.pptx
vabaso5932
 
OPPOTUS - Malaysias on Malaysia 1Q2025.pdf
Oppotus
 
apidays Helsinki & North 2025 - APIs at Scale: Designing for Alignment, Trust...
apidays
 
Optimizing Large Language Models with vLLM and Related Tools.pdf
Tamanna36
 

Practical Introduction to Web scraping using R

  • 1. 1
  • 2. Connect With Us Website ( ) Free Online R Courses ( ) R Packages ( ) Shiny Apps ( ) Blog ( ) GitHub ( ) YouTube ( ) Twitter ( ) Facebook ( ) Linkedin ( ) • https://www.rsquaredacademy.com/ • https://rsquared-academy.thinkific.com/ • https://pkgs.rsquaredacademy.com • https://apps.rsquaredacademy.com • https://blog.rsquaredacademy.com • https://github.com/rsquaredacademy • https://www.youtube.com/user/rsquaredin/ • https://twitter.com/rsquaredacademy • https://www.facebook.com/rsquaredacademy/ • https://in.linkedin.com/company/rsquared-academy 2
  • 3. what? why? how? use cases HTML basics case studies • • • • • • 3
  • 4. 4
  • 5. 5
  • 6. 6
  • 7. 7
  • 8. 8
  • 9. 9
  • 10. 10
  • 11. 11
  • 12. 12
  • 13. 13
  • 14. 14
  • 15. 15
  • 16. 16
  • 17. 17
  • 18. 18
  • 19. 19
  • 21. 21
  • 23. Read Web Page imdb <- read_html("https://www.imdb.com/search/title?groups=top_250&sort imdb ## {xml_document} ## <html xmlns:og="http://ogp.me/ns#" xmlns:fb="http://www.facebook.com/ ## [1] <head>n<meta http-equiv="Content-Type" content="text/html; chars ## [2] <body id="styleguide-v2" class="fixed">nn <img heigh 23
  • 24. 24
  • 25. Title imdb %>% html_nodes(".lister-item-content h3 a") %>% html_text() -> movie_title movie_title ## [1] "The Shawshank Redemption" ## [2] "The Godfather" ## [3] "The Dark Knight" ## [4] "The Godfather: Part II" ## [5] "The Lord of the Rings: The Return of the King" ## [6] "Pulp Fiction" ## [7] "Schindler's List" ## [8] "Il buono, il brutto, il cattivo" ## [9] "12 Angry Men" ## [10] "Inception" ## [11] "Fight Club" ## [12] "The Lord of the Rings: The Fellowship of the Ring" ## [13] "Forrest Gump" ## [14] "The Lord of the Rings: The Two Towers" ## [15] "The Matrix" ## [16] "Goodfellas" ## [17] "Star Wars: Episode V - The Empire Strikes Back" 25
  • 26. 26
  • 27. Year of Release imdb %>% html_nodes(".lister-item-content h3 .lister-item-year") %>% html_text() %>% str_sub(start = 2, end = 5) %>% as.Date(format = "%Y") %>% year() -> movie_year movie_year ## [1] 1994 1972 2008 1974 2003 1994 1993 1966 1957 2010 1999 2001 1994 ## [15] 1999 1990 1980 1975 1954 2014 2002 2001 1998 1999 1997 1995 1995 ## [29] 1991 1977 1946 2018 2016 2018 2018 2014 2011 2006 2006 2002 2000 ## [43] 1998 1994 1991 1988 1988 1985 1981 1979 27
  • 28. 28
  • 29. Certificate imdb %>% html_nodes(".lister-item-content p .certificate") %>% html_text() -> movie_certificate movie_certificate ## [1] "A" "A" "UA" "PG-13" "A" "A" "UA" "A" ## [9] "PG-13" "PG-13" "PG-13" "A" "A" "PG" "UA" "R" ## [17] "PG" "A" "A" "PG-13" "A" "R" "A" "A" ## [25] "U" "PG" "UA" "U" "U" "UA" "A" "UA" ## [33] "PG-13" "A" "R" "R" "R" "A" "U" "U" ## [41] "R" "U" "PG" "R" 29
  • 30. 30
  • 31. Runtime imdb %>% html_nodes(".lister-item-content p .runtime") %>% html_text() %>% str_split(" ") %>% map_chr(1) %>% as.numeric() -> movie_runtime movie_runtime ## [1] 142 175 152 202 201 154 195 161 96 148 139 178 142 179 136 146 ## [18] 133 207 169 130 125 169 189 116 106 127 110 118 121 130 139 161 ## [35] 149 106 112 130 151 150 113 155 119 88 137 155 89 116 115 147 31
  • 32. 32
  • 33. Genre imdb %>% html_nodes(".lister-item-content p .genre") %>% html_text() %>% str_trim() -> movie_genre movie_genre ## [1] "Drama" "Crime, Drama" ## [3] "Action, Crime, Drama" "Crime, Drama" ## [5] "Adventure, Drama, Fantasy" "Crime, Drama" ## [7] "Biography, Drama, History" "Western" ## [9] "Drama" "Action, Adventure, Sci-Fi" ## [11] "Drama" "Adventure, Drama, Fantasy" ## [13] "Drama, Romance" "Adventure, Drama, Fantasy" ## [15] "Action, Sci-Fi" "Biography, Crime, Drama" ## [17] "Action, Adventure, Fantasy" "Drama" ## [19] "Adventure, Drama" "Adventure, Drama, Sci-Fi" ## [21] "Crime, Drama" "Animation, Adventure, Family" ## [23] "Drama, War" "Crime, Drama, Fantasy" ## [25] "Comedy, Drama, Romance" "Crime, Mystery, Thriller" ## [27] "Crime, Drama, Mystery" "Action, Crime, Drama" ## [29] "Crime, Drama, Thriller" "Action, Adventure, Fantasy" ## [31] "Drama, Family, Fantasy" "Crime, Thriller" 33
  • 34. 34
  • 35. Rating imdb %>% html_nodes(".ratings-bar .ratings-imdb-rating") %>% html_attr("data-value") %>% as.numeric() -> movie_rating movie_rating ## [1] 9.3 9.2 9.0 9.0 8.9 8.9 8.9 8.9 8.9 8.8 8.8 8.8 8.8 8.7 8.7 8.7 ## [18] 8.7 8.7 8.6 8.6 8.6 8.6 8.6 8.6 8.6 8.6 8.6 8.6 8.6 8.6 8.5 8.5 ## [35] 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5 35
  • 36. 36
  • 37. 37
  • 38. Votes imdb %>% html_nodes(xpath = '//meta[@itemprop="ratingCount"]') %>% html_attr('content') %>% as.numeric() -> movie_votes movie_votes ## [1] 2072893 1422292 2038787 987020 1475650 1621033 1074273 615219 ## [9] 585562 1817393 1658750 1492209 1589127 1334563 1489071 895033 ## [17] 1040130 822277 280024 1276946 637716 549410 1096231 1000909 ## [25] 545280 897576 1271530 913352 1118817 1109777 352837 39132 ## [33] 118413 174125 617621 605417 666327 1052901 1064050 633675 ## [41] 1021511 1198326 941917 823238 897607 198398 192715 923178 ## [49] 803033 542311 38
  • 39. 39
  • 40. Revenue imdb %>% html_nodes(xpath = '//span[@name="nv"]') %>% html_text() %>% str_extract(pattern = "^$.*") %>% na.omit() %>% as.character() %>% append(values = NA, after = 30) %>% append(values = NA, after = 46) %>% str_sub(start = 2, end = nchar(.) - 1) %>% as.numeric() -> movie_revenue movie_revenue ## [1] 28.34 134.97 534.86 57.30 377.85 107.93 96.07 6.10 4.36 2 ## [11] 37.03 315.54 330.25 342.55 171.48 46.84 290.48 112.00 0.27 1 ## [21] 7.56 10.06 216.54 136.80 57.60 23.34 100.13 19.50 130.74 3 ## [31] NA 1.19 12.39 190.24 678.82 13.09 13.18 53.09 132.38 ## [41] 25.54 187.71 6.72 312.90 204.84 11.99 NA 210.61 248.16 40
  • 41. Putting it all together… top_50 <- tibble(title = movie_title, release = movie_year, `runtime (mins)` = movie_runtime, genre = movie_genre, rating = movi votes = movie_votes, `revenue ($ millions)` = movie_revenue) top_50 ## # A tibble: 50 x 7 ## title release `runtime (mins)` genre rating votes `revenue ( ## <chr> <dbl> <dbl> <chr> <dbl> <dbl> ## 1 The Sha~ 1994 142 Drama 9.3 2.07e6 ## 2 The God~ 1972 175 Crime,~ 9.2 1.42e6 ## 3 The Dar~ 2008 152 Action~ 9 2.04e6 ## 4 The God~ 1974 202 Crime,~ 9 9.87e5 ## 5 The Lor~ 2003 201 Advent~ 8.9 1.48e6 ## 6 Pulp Fi~ 1994 154 Crime,~ 8.9 1.62e6 ## 7 Schindl~ 1993 195 Biogra~ 8.9 1.07e6 ## 8 Il buon~ 1966 161 Western 8.9 6.15e5 ## 9 12 Angr~ 1957 96 Drama 8.9 5.86e5 ## 10 Incepti~ 2010 148 Action~ 8.8 1.82e6 ## # ... with 40 more rows 41
  • 42. 42
  • 44. Read Web Page rbi_guv <- read_html("https://en.wikipedia.org/wiki/List_of_Governors_of rbi_guv ## {xml_document} ## <html class="client-nojs" lang="en" dir="ltr"> ## [1] <head>n<meta http-equiv="Content-Type" content="text/html; chars ## [2] <body class="mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns- 44
  • 45. List of Governors rbi_guv %>% html_nodes("table") %>% html_table() %>% extract2(2) -> profile profile ## No. Officeholder Portrait Term start Term ## 1 1 Osborne Smith NA 1 April 1935 30 June 1 ## 2 2 James Braid Taylor NA 1 July 1937 17 February 1 ## 3 3 C. D. Deshmukh NA 11 August 1943ii 30 May 1 ## 4 4 Benegal Rama Rau NA 1 July 1949 14 January 1 ## 5 5 K. G. Ambegaonkar NA 14 January 1957 28 February 1 ## 6 6 H. V. R. Iyengar NA 1 March 1957 28 February 1 ## 7 7 P. C. Bhattacharya NA 1 March 1962 30 June 1 ## 8 8 Lakshmi Kant Jha NA 1 July 1967 3 May 1 ## 9 9 B. N. Adarkar NA 4 May 1970 15 June 1 ## 10 10 Sarukkai Jagannathan NA 16 June 1970 19 May 1 ## 11 11 N. C. Sen Gupta NA 19 May 1975 19 August 1 ## 12 12 K. R. Puri NA 20 August 1975 2 May 1 ## 13 13 M. Narasimham NA 3 May 1977 30 November 1 ## 14 14 I. G. Patel NA 1 December 1977 15 September 1 ## 15 15 Manmohan Singh NA 16 September 1982 14 January 1 45
  • 46. Sort profile %>% separate(`Term in office`, into = c("term", "days")) %>% select(Officeholder, term) %>% arrange(desc(as.numeric(term))) ## Officeholder term ## 1 Benegal Rama Rau 2754 ## 2 C. D. Deshmukh 2150 ## 3 R. N. Malhotra 2147 ## 4 Bimal Jalan 2114 ## 5 James Braid Taylor 2057 ## 6 P. C. Bhattacharya 1947 ## 7 Y. Venugopal Reddy 1826 ## 8 H. V. R. Iyengar 1825 ## 9 D. Subbarao 1825 ## 10 Sarukkai Jagannathan 1798 ## 11 C. Rangarajan 1795 ## 12 I. G. Patel 1749 ## 13 Raghuram Rajan 1096 ## 14 Lakshmi Kant Jha 1037 ## 15 Urjit Patel 947 ## 16 Manmohan Singh 851 46
  • 47. Backgrounds profile %>% count(Background) ## # A tibble: 9 x 2 ## Background n ## <chr> <int> ## 1 "" 1 ## 2 Banker 2 ## 3 Career Reserve Bank of India officer 1 ## 4 Economist 7 ## 5 IAS officer 4 ## 6 ICS officer 7 ## 7 Indian Administrative Service (IAS) officer 1 ## 8 Indian Audit and Accounts Service officer 1 ## 9 Indian Civil Service (ICS) officer 1 47
  • 48. Backgrounds profile %>% pull(Background) %>% fct_collapse( Bureaucrats = c("IAS officer", "ICS officer", "Indian Administrative Service (IAS) officer", "Indian Audit and Accounts Service officer", "Indian Civil Service (ICS) officer"), `No Info` = c(""), `RBI Officer` = c("Career Reserve Bank of India officer") ) %>% fct_count() %>% rename(background = f, count = n) -> backgrounds 48
  • 49. Backgrounds backgrounds ## # A tibble: 5 x 2 ## background count ## <fct> <int> ## 1 No Info 1 ## 2 Banker 2 ## 3 RBI Officer 1 ## 4 Economist 7 ## 5 Bureaucrats 14 49
  • 50. Backgrounds backgrounds %>% ggplot() + geom_col(aes(background, count), fill = "blue") + xlab("Background") + ylab("Count") + ggtitle("Background of RBI Governors") 50
  • 51. 51
  • 52. Summary web scraping is the extraction of data from web sites best for static & well structured HTML pages review robots.txt file HTML code can change any time if API is available, please use it do not overwhelm websites with requests • • • • • • 52
  • 53. 53