SlideShare a Scribd company logo
Comcast Consumer Complaints
A first approach in R language
Olabanji Shonibare
oyshonib@mtu.edu
Natural Language Processing (NLP)
NLP is a set of techniques for approaching text
problems
Natural Language Processing (NLP)
A few questions
• Word frequency
• Variation across states
Comcast Consumer Complaints
• comcast_consumeraffairs_complaints.csv
• comcast_fcc_complaints_2015.csv
Raw complaint data about Comcast television and internet
published at consumeraffairs.com between 04/08 and 09/16.
Raw complaints made to the FCC about Comcast between 04/15
and 06/15.
Preliminaries
# Load both complaint datasets.
# NOTE(review): the factor output below suggests this was run on R < 4.0,
# where read.csv defaulted to stringsAsFactors = TRUE — confirm if rerun.
df     <- read.csv("comcast_consumeraffairs_complaints.csv")
df_fcc <- read.csv("comcast_fcc_complaints_2015.csv")

# Quick structural inspection of each dataset.
dim(df)
## [1] 5659    4
names(df)
## [1] "author"    "posted_on" "rating"    "text"

dim(df_fcc)
## [1] 2225   11
names(df_fcc)
##  [1] "Ticket.."            "Customer.Complaint"
##  [3] "Date"                "Time"
##  [5] "Received.Via"        "City"
##  [7] "State"               "Zip.code"
##  [9] "Status"              "Filing.on.Behalf.of.Someone"
Comcast Consumer Affairs Complaints
# Bar chart of the raw rating distribution (0 = unrated placeholder).
ggplot(df) +
  geom_bar(aes(x = rating))

# Count complaints per rating level.
df %>%
  group_by(rating) %>%
  summarise(count = n())
## # A tibble: 6 x 2
##   rating count
##    <int> <int>
## 1      0  1560
## 2      1  3734
## 3      2   260
## 4      3    54
## 5      4    19
## 6      5    32

# Drop the rating == 0 rows (no rating given) and re-plot.
df2 <- df %>% filter(rating != 0)

ggplot(df2) +
  geom_bar(aes(x = rating))
# Derive a State column from the author field: the author string ends
# with a two-letter state code, so take its last two characters,
# upper-cased.
# NOTE(review): this heuristic produces some non-US-state codes
# (e.g. "BC", "ER" appear later) — consider validating against
# state.abb if accuracy matters.
df3 <- df2 %>%
  mutate(State = str_sub(toupper(author), -2))

# Complaints per state, highest first.
df3 %>%
  group_by(State) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count))
## # A tibble: 52 x 2
##    State Count
##    <chr> <int>
## 1     FL   650
## 2     CA   345
## 3     GA   320
## 4     IL   284
## 5     PA   221
## 6     TN   202
## 7     TX   193
## 8     MI   189
## 9     WA   168
## 10    NJ   167
## # ... with 42 more rows
# Partition the rated complaints: low = ratings 1-2, high = ratings 3-5.
low_rating <- df2 %>%
  filter(rating < 3)

high_rating <- df2 %>%
  filter(rating >= 3)

nrow(low_rating)
## [1] 3994
nrow(high_rating)
## [1] 105
# cs_ratio: customer satisfaction ratio — the fraction of a state's
# complaints whose rating exceeds 2 (i.e. rating >= 3).
df3 %>%
  select(State, rating) %>%
  group_by(State) %>%
  # mean() of a logical vector is the proportion of TRUEs — identical
  # to length(rating[rating > 2]) / length(rating) but idiomatic.
  summarise(cs_ratio = mean(rating > 2)) %>%
  arrange(desc(cs_ratio))
## # A tibble: 52 x 2
##    State   cs_ratio
##    <chr>      <dbl>
## 1     IA 1.00000000
## 2     ID 1.00000000
## 3     BC 0.50000000
## 4     NV 0.33333333
## 5     WV 0.13333333
## 6     NH 0.10714286
## 7     MO 0.09090909
## 8     ER 0.06666667
## 9     SC 0.05263158
## 10    AZ 0.05000000
## # ... with 42 more rows
States with high customer satisfaction ratio (rating >2)
Word cloud for low ratings
# Stopword list for low-rating complaints: English stopwords plus the
# brand name itself (it would otherwise dominate the cloud).
low_stops <- c("comcast", stopwords("english"))

low_ratingCorpus <-
  Corpus(VectorSource(low_rating$text)) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  # content_transformer() is required to apply non-tm functions such as
  # tolower in current versions of the tm package.
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, low_stops) %>%
  # (a second removeWords(stopwords("english")) pass was removed:
  # low_stops already contains the English stopword list)
  tm_map(stripWhitespace) %>%
  tm_map(PlainTextDocument)
  #tm_map(stemDocument)

wordcloud(low_ratingCorpus, scale = c(5, 0.5), max.words = 100,
          random.order = FALSE, rot.per = 0.35, use.r.layout = FALSE,
          colors = brewer.pal(8, "Dark2"))
Word cloud for low ratings
# NOTE(review): this slide repeats the previous low-rating word-cloud
# code verbatim (likely a duplicated slide in the deck).
# Stopword list: English stopwords plus the brand name itself.
low_stops <- c("comcast", stopwords("english"))

low_ratingCorpus <-
  Corpus(VectorSource(low_rating$text)) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  # content_transformer() is required to apply non-tm functions such as
  # tolower in current versions of the tm package.
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, low_stops) %>%
  # (a second removeWords(stopwords("english")) pass was removed:
  # low_stops already contains the English stopword list)
  tm_map(stripWhitespace) %>%
  tm_map(PlainTextDocument)
  #tm_map(stemDocument)

wordcloud(low_ratingCorpus, scale = c(5, 0.5), max.words = 100,
          random.order = FALSE, rot.per = 0.35, use.r.layout = FALSE,
          colors = brewer.pal(8, "Dark2"))
Word cloud for high ratings
# Stopword list for high-rating complaints: English stopwords plus the
# brand name itself.
temp_stops <- c("comcast", stopwords("english"))

high_ratingCorpus <-
  Corpus(VectorSource(high_rating$text)) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  # content_transformer() is required to apply non-tm functions such as
  # tolower in current versions of the tm package.
  tm_map(content_transformer(tolower)) %>%
  #tm_map(removeWords, low_rating_stops) %>%
  tm_map(removeWords, temp_stops) %>%
  tm_map(stripWhitespace) %>%
  tm_map(PlainTextDocument)
  #tm_map(stemDocument)

wordcloud(high_ratingCorpus, scale = c(5, 0.5), max.words = 100,
          random.order = FALSE, rot.per = 0.35, use.r.layout = FALSE,
          colors = brewer.pal(8, "Dark2"))
Comcast FCC Complaints (2015)
# State vs number of complaints (FCC dataset).
temp1 <- df_fcc %>%
  group_by(State) %>%
  summarise(no_complaints = n()) %>%
  arrange(desc(no_complaints))

#temp1

# Bar chart, with scale_x_discrete(limits = ...) keeping the bars in
# descending-count order instead of alphabetical.
ggplot(temp1, aes(x = State, y = no_complaints)) +
  geom_bar(stat = "identity") +
  scale_x_discrete(limits = temp1[["State"]]) +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))
# Keep only the ten states with the most complaints.
temp2 <- temp1 %>%
  slice(1:10)

# Same ordered bar chart, restricted to the top ten.
ggplot(temp2, aes(x = State, y = no_complaints)) +
  geom_bar(stat = "identity") +
  scale_x_discrete(limits = temp2[["State"]]) +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))
# City vs number of complaints (FCC dataset), highest first.
temp3 <- df_fcc %>%
  group_by(City) %>%
  summarise(no_complaints = n()) %>%
  arrange(desc(no_complaints))
## # A tibble: 928 x 2
##            City no_complaints
##          <fctr>         <int>
## 1       Atlanta            63
## 2       Chicago            47
## 3     Knoxville            36
## 4       Houston            33
## 5  Jacksonville            31
## 6  Philadelphia            25
## 7        Denver            22
## 8         Miami            22
## 9     Nashville            22
## 10 Indianapolis            21
## # ... with 918 more rows

# All 928 cities on one axis — size = 1 shrinks the labels so they fit,
# though the chart is mostly useful as an overview shape.
ggplot(temp3, aes(x = City, y = no_complaints)) +
  geom_bar(stat = "identity") +
  scale_x_discrete(limits = temp3[["City"]]) +
  theme(axis.text.x = element_text(angle = 60, hjust = 1, size = 1))
# Keep the twenty cities with the most complaints.
temp4 <- temp3 %>%
  slice(1:20)
## # A tibble: 20 x 2
##             City no_complaints
##           <fctr>         <int>
## 1        Atlanta            63
## 2        Chicago            47
## 3      Knoxville            36
## 4        Houston            33
## 5   Jacksonville            31
## 6   Philadelphia            25
## 7         Denver            22
## 8          Miami            22
## 9      Nashville            22
## 10  Indianapolis            21
## 11 San Francisco            20
## 12      San Jose            20
## 13     Baltimore            19
## 14        Tucson            19
## 15    Washington            19
## 16      Marietta            16
## 17      Portland            16
## 18       Seattle            14
## 19       Memphis            13
## 20        Canton            12

# Ordered bar chart of the top twenty cities.
ggplot(temp4, aes(x = City, y = no_complaints)) +
  geom_bar(stat = "identity") +
  scale_x_discrete(limits = temp4[["City"]]) +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))
# Most frequent complaint titles. Note the near-duplicates differing
# only in case/pluralization ("Comcast Data Cap" vs "Comcast data caps"),
# which motivates the text normalization on the next slide.
df_fcc %>%
  group_by(Customer.Complaint) %>%
  summarise(no_complaints = n()) %>%
  arrange(desc(no_complaints))
## # A tibble: 1,842 x 2
##          Customer.Complaint no_complaints
##                      <fctr>         <int>
## 1                   Comcast            83
## 2          Comcast Internet            18
## 3          Comcast Data Cap            17
## 4                   comcast            13
## 5           Comcast Billing            11
## 6         Comcast Data Caps            11
## 7                 Data Caps            11
## 8  Unfair Billing Practices             9
## 9          Comcast data cap             8
## 10        Comcast data caps             8
## # ... with 1,832 more rows
# Extended stopword list: the brand name, common filler words seen in
# the complaint titles, and the standard English stopwords.
# FIX: the original began with a typographic quote (‘comcast'), which is
# a syntax error in R; replaced with a plain ASCII quote.
all_stops <- c("comcast", "now", "company", "day", "someone", "thing",
               "also", "got", "way", "call", "called", "one", "said",
               "tell", stopwords("english"))

df_fccCorpus <-
  Corpus(VectorSource(df_fcc$Customer.Complaint)) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  # content_transformer() is required to apply non-tm functions such as
  # tolower in current versions of the tm package.
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, all_stops) %>%
  tm_map(stripWhitespace) %>%
  tm_map(PlainTextDocument)
  #tm_map(stemDocument)

wordcloud(df_fccCorpus, scale = c(5, 0.5), max.words = 100,
          random.order = FALSE, rot.per = 0.35, use.r.layout = FALSE,
          colors = brewer.pal(8, "Dark2"))
https://www.kaggle.com/dan195/d/archaeocharlie/comcastcomplaints/first-run
Credits:

More Related Content

PDF
Natural language processing using comcast reviews
Olabanji Shonibare
 
PPTX
Startupfest 2015: HARPER REED (Modest, Inc.) - Lightning Keynote
Startupfest
 
PPTX
The conflict between netflix and comcast
Amtheyst Floyd
 
PDF
Intro to nlp
Rutu Mulkar-Mehta
 
PPTX
Natural language processing 2
Tony Vo
 
PPSX
Gordana Panajotović - NLP Master
NLP Centar Beograd
 
PPTX
Text Mining Infrastructure in R
Ashraf Uddin
 
PDF
Introduction to nlp 2014
Grant Hamel
 
Natural language processing using comcast reviews
Olabanji Shonibare
 
Startupfest 2015: HARPER REED (Modest, Inc.) - Lightning Keynote
Startupfest
 
The conflict between netflix and comcast
Amtheyst Floyd
 
Intro to nlp
Rutu Mulkar-Mehta
 
Natural language processing 2
Tony Vo
 
Gordana Panajotović - NLP Master
NLP Centar Beograd
 
Text Mining Infrastructure in R
Ashraf Uddin
 
Introduction to nlp 2014
Grant Hamel
 

Viewers also liked (8)

PDF
NLP& Bigdata. Motivation and Action
Sarath P R
 
PPTX
Text analytics in Python and R with examples from Tobacco Control
Ben Healey
 
PDF
Introducing natural language processing(NLP) with r
Vivian S. Zhang
 
PPTX
Natural Language Processing in R (rNLP)
fridolin.wild
 
PDF
Natural Language Processing with Python
Benjamin Bengfort
 
KEY
R by example: mining Twitter for consumer attitudes towards airlines
Jeffrey Breen
 
KEY
NLTK in 20 minutes
Jacob Perkins
 
PDF
Text Mining with R -- an Analysis of Twitter Data
Yanchang Zhao
 
NLP& Bigdata. Motivation and Action
Sarath P R
 
Text analytics in Python and R with examples from Tobacco Control
Ben Healey
 
Introducing natural language processing(NLP) with r
Vivian S. Zhang
 
Natural Language Processing in R (rNLP)
fridolin.wild
 
Natural Language Processing with Python
Benjamin Bengfort
 
R by example: mining Twitter for consumer attitudes towards airlines
Jeffrey Breen
 
NLTK in 20 minutes
Jacob Perkins
 
Text Mining with R -- an Analysis of Twitter Data
Yanchang Zhao
 
Ad

Recently uploaded (20)

PDF
Automating ArcGIS Content Discovery with FME: A Real World Use Case
Safe Software
 
PPTX
cloud computing vai.pptx for the project
vaibhavdobariyal79
 
PDF
Orbitly Pitch Deck|A Mission-Driven Platform for Side Project Collaboration (...
zz41354899
 
PDF
Using Anchore and DefectDojo to Stand Up Your DevSecOps Function
Anchore
 
PPTX
AI and Robotics for Human Well-being.pptx
JAYMIN SUTHAR
 
PDF
Data_Analytics_vs_Data_Science_vs_BI_by_CA_Suvidha_Chaplot.pdf
CA Suvidha Chaplot
 
PDF
CIFDAQ's Market Wrap : Bears Back in Control?
CIFDAQ
 
PPTX
IT Runs Better with ThousandEyes AI-driven Assurance
ThousandEyes
 
PDF
Google I/O Extended 2025 Baku - all ppts
HusseinMalikMammadli
 
PPTX
The-Ethical-Hackers-Imperative-Safeguarding-the-Digital-Frontier.pptx
sujalchauhan1305
 
PDF
A Day in the Life of Location Data - Turning Where into How.pdf
Precisely
 
PDF
How-Cloud-Computing-Impacts-Businesses-in-2025-and-Beyond.pdf
Artjoker Software Development Company
 
PDF
Cloud-Migration-Best-Practices-A-Practical-Guide-to-AWS-Azure-and-Google-Clou...
Artjoker Software Development Company
 
PDF
Structs to JSON: How Go Powers REST APIs
Emily Achieng
 
PDF
Event Presentation Google Cloud Next Extended 2025
minhtrietgect
 
PDF
Brief History of Internet - Early Days of Internet
sutharharshit158
 
PDF
A Strategic Analysis of the MVNO Wave in Emerging Markets.pdf
IPLOOK Networks
 
PDF
The Evolution of KM Roles (Presented at Knowledge Summit Dublin 2025)
Enterprise Knowledge
 
PDF
Economic Impact of Data Centres to the Malaysian Economy
flintglobalapac
 
PPTX
Dev Dives: Automate, test, and deploy in one place—with Unified Developer Exp...
AndreeaTom
 
Automating ArcGIS Content Discovery with FME: A Real World Use Case
Safe Software
 
cloud computing vai.pptx for the project
vaibhavdobariyal79
 
Orbitly Pitch Deck|A Mission-Driven Platform for Side Project Collaboration (...
zz41354899
 
Using Anchore and DefectDojo to Stand Up Your DevSecOps Function
Anchore
 
AI and Robotics for Human Well-being.pptx
JAYMIN SUTHAR
 
Data_Analytics_vs_Data_Science_vs_BI_by_CA_Suvidha_Chaplot.pdf
CA Suvidha Chaplot
 
CIFDAQ's Market Wrap : Bears Back in Control?
CIFDAQ
 
IT Runs Better with ThousandEyes AI-driven Assurance
ThousandEyes
 
Google I/O Extended 2025 Baku - all ppts
HusseinMalikMammadli
 
The-Ethical-Hackers-Imperative-Safeguarding-the-Digital-Frontier.pptx
sujalchauhan1305
 
A Day in the Life of Location Data - Turning Where into How.pdf
Precisely
 
How-Cloud-Computing-Impacts-Businesses-in-2025-and-Beyond.pdf
Artjoker Software Development Company
 
Cloud-Migration-Best-Practices-A-Practical-Guide-to-AWS-Azure-and-Google-Clou...
Artjoker Software Development Company
 
Structs to JSON: How Go Powers REST APIs
Emily Achieng
 
Event Presentation Google Cloud Next Extended 2025
minhtrietgect
 
Brief History of Internet - Early Days of Internet
sutharharshit158
 
A Strategic Analysis of the MVNO Wave in Emerging Markets.pdf
IPLOOK Networks
 
The Evolution of KM Roles (Presented at Knowledge Summit Dublin 2025)
Enterprise Knowledge
 
Economic Impact of Data Centres to the Malaysian Economy
flintglobalapac
 
Dev Dives: Automate, test, and deploy in one place—with Unified Developer Exp...
AndreeaTom
 
Ad

Natural language procesing in R