SlideShare a Scribd company logo
Comcast Consumer Complaints
A first approach in R language
Olabanji Shonibare
oyshonib@mtu.edu
Natural Language Processing (NLP)
NLP is a set of techniques for approaching text
problems
Natural Language Processing (NLP)
A few questions
• Word frequency
• Variation across states
Comcast Consumer Complaints
• comcast_consumeraffairs_complaints.csv
• comcast_fcc_complaints_2015.csv
Raw complaint data about Comcast television and internet
published at consumeraffairs.com between 04/08 and 09/16.
Raw complaints made to the FCC about Comcast between 04/15
and 06/15.
Preliminaries
# Load both complaint datasets.
# NOTE(review): the factor output below suggests this was run on R < 4.0,
# where read.csv defaulted to stringsAsFactors = TRUE — confirm if rerun.
df     <- read.csv("comcast_consumeraffairs_complaints.csv")
df_fcc <- read.csv("comcast_fcc_complaints_2015.csv")

# Quick structural inspection of each dataset.
dim(df)
## [1] 5659    4
names(df)
## [1] "author"    "posted_on" "rating"    "text"

dim(df_fcc)
## [1] 2225   11
names(df_fcc)
##  [1] "Ticket.."            "Customer.Complaint"
##  [3] "Date"                "Time"
##  [5] "Received.Via"        "City"
##  [7] "State"               "Zip.code"
##  [9] "Status"              "Filing.on.Behalf.of.Someone"
Comcast Consumer Affairs Complaints
# Bar chart of the raw rating distribution (0 = unrated placeholder).
ggplot(df) +
  geom_bar(aes(x = rating))

# Count complaints per rating level.
df %>%
  group_by(rating) %>%
  summarise(count = n())
## # A tibble: 6 x 2
##   rating count
##    <int> <int>
## 1      0  1560
## 2      1  3734
## 3      2   260
## 4      3    54
## 5      4    19
## 6      5    32

# Drop the rating == 0 rows (no rating given) and re-plot.
df2 <- df %>% filter(rating != 0)

ggplot(df2) +
  geom_bar(aes(x = rating))
# Derive a State column from the author field: the author string ends
# with a two-letter state code, so take its last two characters,
# upper-cased.
# NOTE(review): this heuristic produces some non-US-state codes
# (e.g. "BC", "ER" appear later) — consider validating against
# state.abb if accuracy matters.
df3 <- df2 %>%
  mutate(State = str_sub(toupper(author), -2))

# Complaints per state, highest first.
df3 %>%
  group_by(State) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count))
## # A tibble: 52 x 2
##    State Count
##    <chr> <int>
## 1     FL   650
## 2     CA   345
## 3     GA   320
## 4     IL   284
## 5     PA   221
## 6     TN   202
## 7     TX   193
## 8     MI   189
## 9     WA   168
## 10    NJ   167
## # ... with 42 more rows
# Partition the rated complaints: low = ratings 1-2, high = ratings 3-5.
low_rating <- df2 %>%
  filter(rating < 3)

high_rating <- df2 %>%
  filter(rating >= 3)

nrow(low_rating)
## [1] 3994
nrow(high_rating)
## [1] 105
# cs_ratio: customer satisfaction ratio — the fraction of a state's
# complaints whose rating exceeds 2 (i.e. rating >= 3).
df3 %>%
  select(State, rating) %>%
  group_by(State) %>%
  # mean() of a logical vector is the proportion of TRUEs — identical
  # to length(rating[rating > 2]) / length(rating) but idiomatic.
  summarise(cs_ratio = mean(rating > 2)) %>%
  arrange(desc(cs_ratio))
## # A tibble: 52 x 2
##    State   cs_ratio
##    <chr>      <dbl>
## 1     IA 1.00000000
## 2     ID 1.00000000
## 3     BC 0.50000000
## 4     NV 0.33333333
## 5     WV 0.13333333
## 6     NH 0.10714286
## 7     MO 0.09090909
## 8     ER 0.06666667
## 9     SC 0.05263158
## 10    AZ 0.05000000
## # ... with 42 more rows
States with high customer satisfaction ratio (rating >2)
Word cloud for low ratings
# Stopword list for low-rating complaints: English stopwords plus the
# brand name itself (it would otherwise dominate the cloud).
low_stops <- c("comcast", stopwords("english"))

low_ratingCorpus <-
  Corpus(VectorSource(low_rating$text)) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  # content_transformer() is required to apply non-tm functions such as
  # tolower in current versions of the tm package.
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, low_stops) %>%
  # (a second removeWords(stopwords("english")) pass was removed:
  # low_stops already contains the English stopword list)
  tm_map(stripWhitespace) %>%
  tm_map(PlainTextDocument)
  #tm_map(stemDocument)

wordcloud(low_ratingCorpus, scale = c(5, 0.5), max.words = 100,
          random.order = FALSE, rot.per = 0.35, use.r.layout = FALSE,
          colors = brewer.pal(8, "Dark2"))
Word cloud for low ratings
# NOTE(review): this slide repeats the previous low-rating word-cloud
# code verbatim (likely a duplicated slide in the deck).
# Stopword list: English stopwords plus the brand name itself.
low_stops <- c("comcast", stopwords("english"))

low_ratingCorpus <-
  Corpus(VectorSource(low_rating$text)) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  # content_transformer() is required to apply non-tm functions such as
  # tolower in current versions of the tm package.
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, low_stops) %>%
  # (a second removeWords(stopwords("english")) pass was removed:
  # low_stops already contains the English stopword list)
  tm_map(stripWhitespace) %>%
  tm_map(PlainTextDocument)
  #tm_map(stemDocument)

wordcloud(low_ratingCorpus, scale = c(5, 0.5), max.words = 100,
          random.order = FALSE, rot.per = 0.35, use.r.layout = FALSE,
          colors = brewer.pal(8, "Dark2"))
Word cloud for high ratings
# Stopword list for high-rating complaints: English stopwords plus the
# brand name itself.
temp_stops <- c("comcast", stopwords("english"))

high_ratingCorpus <-
  Corpus(VectorSource(high_rating$text)) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  # content_transformer() is required to apply non-tm functions such as
  # tolower in current versions of the tm package.
  tm_map(content_transformer(tolower)) %>%
  #tm_map(removeWords, low_rating_stops) %>%
  tm_map(removeWords, temp_stops) %>%
  tm_map(stripWhitespace) %>%
  tm_map(PlainTextDocument)
  #tm_map(stemDocument)

wordcloud(high_ratingCorpus, scale = c(5, 0.5), max.words = 100,
          random.order = FALSE, rot.per = 0.35, use.r.layout = FALSE,
          colors = brewer.pal(8, "Dark2"))
Comcast FCC Complaints (2015)
# State vs number of complaints (FCC dataset).
temp1 <- df_fcc %>%
  group_by(State) %>%
  summarise(no_complaints = n()) %>%
  arrange(desc(no_complaints))

#temp1

# Bar chart, with scale_x_discrete(limits = ...) keeping the bars in
# descending-count order instead of alphabetical.
ggplot(temp1, aes(x = State, y = no_complaints)) +
  geom_bar(stat = "identity") +
  scale_x_discrete(limits = temp1[["State"]]) +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))
# Keep only the ten states with the most complaints.
temp2 <- temp1 %>%
  slice(1:10)

# Same ordered bar chart, restricted to the top ten.
ggplot(temp2, aes(x = State, y = no_complaints)) +
  geom_bar(stat = "identity") +
  scale_x_discrete(limits = temp2[["State"]]) +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))
# City vs number of complaints (FCC dataset), highest first.
temp3 <- df_fcc %>%
  group_by(City) %>%
  summarise(no_complaints = n()) %>%
  arrange(desc(no_complaints))
## # A tibble: 928 x 2
##            City no_complaints
##          <fctr>         <int>
## 1       Atlanta            63
## 2       Chicago            47
## 3     Knoxville            36
## 4       Houston            33
## 5  Jacksonville            31
## 6  Philadelphia            25
## 7        Denver            22
## 8         Miami            22
## 9     Nashville            22
## 10 Indianapolis            21
## # ... with 918 more rows

# All 928 cities on one axis — size = 1 shrinks the labels so they fit,
# though the chart is mostly useful as an overview shape.
ggplot(temp3, aes(x = City, y = no_complaints)) +
  geom_bar(stat = "identity") +
  scale_x_discrete(limits = temp3[["City"]]) +
  theme(axis.text.x = element_text(angle = 60, hjust = 1, size = 1))
# Keep the twenty cities with the most complaints.
temp4 <- temp3 %>%
  slice(1:20)
## # A tibble: 20 x 2
##             City no_complaints
##           <fctr>         <int>
## 1        Atlanta            63
## 2        Chicago            47
## 3      Knoxville            36
## 4        Houston            33
## 5   Jacksonville            31
## 6   Philadelphia            25
## 7         Denver            22
## 8          Miami            22
## 9      Nashville            22
## 10  Indianapolis            21
## 11 San Francisco            20
## 12      San Jose            20
## 13     Baltimore            19
## 14        Tucson            19
## 15    Washington            19
## 16      Marietta            16
## 17      Portland            16
## 18       Seattle            14
## 19       Memphis            13
## 20        Canton            12

# Ordered bar chart of the top twenty cities.
ggplot(temp4, aes(x = City, y = no_complaints)) +
  geom_bar(stat = "identity") +
  scale_x_discrete(limits = temp4[["City"]]) +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))
# Most frequent complaint titles. Note the near-duplicates differing
# only in case/pluralization ("Comcast Data Cap" vs "Comcast data caps"),
# which motivates the text normalization on the next slide.
df_fcc %>%
  group_by(Customer.Complaint) %>%
  summarise(no_complaints = n()) %>%
  arrange(desc(no_complaints))
## # A tibble: 1,842 x 2
##          Customer.Complaint no_complaints
##                      <fctr>         <int>
## 1                   Comcast            83
## 2          Comcast Internet            18
## 3          Comcast Data Cap            17
## 4                   comcast            13
## 5           Comcast Billing            11
## 6         Comcast Data Caps            11
## 7                 Data Caps            11
## 8  Unfair Billing Practices             9
## 9          Comcast data cap             8
## 10        Comcast data caps             8
## # ... with 1,832 more rows
# Extended stopword list: the brand name, common filler words seen in
# the complaint titles, and the standard English stopwords.
# FIX: the original began with a typographic quote (‘comcast'), which is
# a syntax error in R; replaced with a plain ASCII quote.
all_stops <- c("comcast", "now", "company", "day", "someone", "thing",
               "also", "got", "way", "call", "called", "one", "said",
               "tell", stopwords("english"))

df_fccCorpus <-
  Corpus(VectorSource(df_fcc$Customer.Complaint)) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  # content_transformer() is required to apply non-tm functions such as
  # tolower in current versions of the tm package.
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, all_stops) %>%
  tm_map(stripWhitespace) %>%
  tm_map(PlainTextDocument)
  #tm_map(stemDocument)

wordcloud(df_fccCorpus, scale = c(5, 0.5), max.words = 100,
          random.order = FALSE, rot.per = 0.35, use.r.layout = FALSE,
          colors = brewer.pal(8, "Dark2"))
https://www.kaggle.com/dan195/d/archaeocharlie/comcastcomplaints/first-run
Credits:

More Related Content

PDF
Natural language processing using comcast reviews
Olabanji Shonibare
 
PPTX
Startupfest 2015: HARPER REED (Modest, Inc.) - Lightning Keynote
Startupfest
 
PPTX
The conflict between netflix and comcast
Amtheyst Floyd
 
PDF
Intro to nlp
Rutu Mulkar-Mehta
 
PPTX
Natural language processing 2
Tony Vo
 
PPSX
Gordana Panajotović - NLP Master
NLP Centar Beograd
 
PPTX
Text Mining Infrastructure in R
Ashraf Uddin
 
PDF
Introduction to nlp 2014
Grant Hamel
 
Natural language processing using comcast reviews
Olabanji Shonibare
 
Startupfest 2015: HARPER REED (Modest, Inc.) - Lightning Keynote
Startupfest
 
The conflict between netflix and comcast
Amtheyst Floyd
 
Intro to nlp
Rutu Mulkar-Mehta
 
Natural language processing 2
Tony Vo
 
Gordana Panajotović - NLP Master
NLP Centar Beograd
 
Text Mining Infrastructure in R
Ashraf Uddin
 
Introduction to nlp 2014
Grant Hamel
 

Viewers also liked (8)

PDF
NLP& Bigdata. Motivation and Action
Sarath P R
 
PPTX
Text analytics in Python and R with examples from Tobacco Control
Ben Healey
 
PDF
Introducing natural language processing(NLP) with r
Vivian S. Zhang
 
PPTX
Natural Language Processing in R (rNLP)
fridolin.wild
 
PDF
Natural Language Processing with Python
Benjamin Bengfort
 
KEY
R by example: mining Twitter for consumer attitudes towards airlines
Jeffrey Breen
 
KEY
NLTK in 20 minutes
Jacob Perkins
 
PDF
Text Mining with R -- an Analysis of Twitter Data
Yanchang Zhao
 
NLP& Bigdata. Motivation and Action
Sarath P R
 
Text analytics in Python and R with examples from Tobacco Control
Ben Healey
 
Introducing natural language processing(NLP) with r
Vivian S. Zhang
 
Natural Language Processing in R (rNLP)
fridolin.wild
 
Natural Language Processing with Python
Benjamin Bengfort
 
R by example: mining Twitter for consumer attitudes towards airlines
Jeffrey Breen
 
NLTK in 20 minutes
Jacob Perkins
 
Text Mining with R -- an Analysis of Twitter Data
Yanchang Zhao
 
Ad

Recently uploaded (20)

PDF
Automating ArcGIS Content Discovery with FME: A Real World Use Case
Safe Software
 
PPTX
cloud computing vai.pptx for the project
vaibhavdobariyal79
 
PDF
Orbitly Pitch Deck|A Mission-Driven Platform for Side Project Collaboration (...
zz41354899
 
PDF
Using Anchore and DefectDojo to Stand Up Your DevSecOps Function
Anchore
 
PPTX
AI and Robotics for Human Well-being.pptx
JAYMIN SUTHAR
 
PDF
Data_Analytics_vs_Data_Science_vs_BI_by_CA_Suvidha_Chaplot.pdf
CA Suvidha Chaplot
 
PDF
CIFDAQ's Market Wrap : Bears Back in Control?
CIFDAQ
 
PPTX
IT Runs Better with ThousandEyes AI-driven Assurance
ThousandEyes
 
PDF
Google I/O Extended 2025 Baku - all ppts
HusseinMalikMammadli
 
PPTX
The-Ethical-Hackers-Imperative-Safeguarding-the-Digital-Frontier.pptx
sujalchauhan1305
 
PDF
A Day in the Life of Location Data - Turning Where into How.pdf
Precisely
 
PDF
How-Cloud-Computing-Impacts-Businesses-in-2025-and-Beyond.pdf
Artjoker Software Development Company
 
PDF
Cloud-Migration-Best-Practices-A-Practical-Guide-to-AWS-Azure-and-Google-Clou...
Artjoker Software Development Company
 
PDF
Structs to JSON: How Go Powers REST APIs
Emily Achieng
 
PDF
Event Presentation Google Cloud Next Extended 2025
minhtrietgect
 
PDF
Brief History of Internet - Early Days of Internet
sutharharshit158
 
PDF
A Strategic Analysis of the MVNO Wave in Emerging Markets.pdf
IPLOOK Networks
 
PDF
The Evolution of KM Roles (Presented at Knowledge Summit Dublin 2025)
Enterprise Knowledge
 
PDF
Economic Impact of Data Centres to the Malaysian Economy
flintglobalapac
 
PPTX
Dev Dives: Automate, test, and deploy in one place—with Unified Developer Exp...
AndreeaTom
 
Automating ArcGIS Content Discovery with FME: A Real World Use Case
Safe Software
 
cloud computing vai.pptx for the project
vaibhavdobariyal79
 
Orbitly Pitch Deck|A Mission-Driven Platform for Side Project Collaboration (...
zz41354899
 
Using Anchore and DefectDojo to Stand Up Your DevSecOps Function
Anchore
 
AI and Robotics for Human Well-being.pptx
JAYMIN SUTHAR
 
Data_Analytics_vs_Data_Science_vs_BI_by_CA_Suvidha_Chaplot.pdf
CA Suvidha Chaplot
 
CIFDAQ's Market Wrap : Bears Back in Control?
CIFDAQ
 
IT Runs Better with ThousandEyes AI-driven Assurance
ThousandEyes
 
Google I/O Extended 2025 Baku - all ppts
HusseinMalikMammadli
 
The-Ethical-Hackers-Imperative-Safeguarding-the-Digital-Frontier.pptx
sujalchauhan1305
 
A Day in the Life of Location Data - Turning Where into How.pdf
Precisely
 
How-Cloud-Computing-Impacts-Businesses-in-2025-and-Beyond.pdf
Artjoker Software Development Company
 
Cloud-Migration-Best-Practices-A-Practical-Guide-to-AWS-Azure-and-Google-Clou...
Artjoker Software Development Company
 
Structs to JSON: How Go Powers REST APIs
Emily Achieng
 
Event Presentation Google Cloud Next Extended 2025
minhtrietgect
 
Brief History of Internet - Early Days of Internet
sutharharshit158
 
A Strategic Analysis of the MVNO Wave in Emerging Markets.pdf
IPLOOK Networks
 
The Evolution of KM Roles (Presented at Knowledge Summit Dublin 2025)
Enterprise Knowledge
 
Economic Impact of Data Centres to the Malaysian Economy
flintglobalapac
 
Dev Dives: Automate, test, and deploy in one place—with Unified Developer Exp...
AndreeaTom
 
Ad

Natural language procesing in R