SlideShare a Scribd company logo
Building Hadoop Data Applications with Kite

Headline Goes Here

Tom White @tom_e_white
Speaker Name or Subhead Goes Here
The Hive, February 18, 2014

1
Hadoop 0.1

% cat bigdata.txt | hadoop fs -put - in
% hadoop MyJob in out
% hadoop fs -get out

2
Characteristics

•  Batch applications only
•  Low-level coding
•  File format
•  Serialization
•  Partitioning scheme

3
A Hadoop Stack


4
Applications

•  [Batch] Analyze an archive of songs [1]
•  [Interactive SQL] Ad hoc queries on recommendations from social media applications [2]
•  [Search] Searching email traffic in near-realtime [3]
•  [ML] Detecting fraudulent transactions using clustering [4]

5

[1] http://blog.cloudera.com/blog/2012/08/process-a-million-songs-with-apache-pig/
[2] http://blog.cloudera.com/blog/2014/01/how-wajam-answers-business-questions-faster-with-hadoop/
[3] http://blog.cloudera.com/blog/2013/09/email-indexing-using-cloudera-search/
[4] http://blog.cloudera.com/blog/2013/03/cloudera_ml_data_science_tools/
Outline

•  A Typical Application
•  Kite SDK
•  An Example
•  Advanced Kite
•  Conclusion
•  Questions

6
A typical application (zoom 100:1)


7
A typical application (zoom 10:1)


8
A typical pipeline (zoom 5:1)


9
Kite SDK


10
Kite Codifies Best Practice as APIs, Tools, Docs
and Examples


11
Kite

•  A client-side library for writing Hadoop Data Applications
•  First release was in April 2013 as CDK
•  0.11.0 earlier this month
•  Open source, Apache 2 license, kitesdk.org
•  Modular
•  Data module (HDFS, Flume, Crunch, Hive, HBase)
•  Morphlines transformation module
•  Maven plugin
12
An Example


13
Kite Data Module

•  Dataset – a collection of entities
•  DatasetRepository – physical storage location for datasets
•  DatasetDescriptor – holds dataset metadata (schema, format)
•  DatasetWriter – write entities to a dataset in a stream
•  DatasetReader – read entities from a dataset

14
1. Define the Event Entity

public class Event {
  private long id;
  private long timestamp;
  private String source;
  // getters and setters
}

15
2. Create the Events Dataset

DatasetRepository repo = DatasetRepositories.open("repo:hive");
DatasetDescriptor descriptor =
    new DatasetDescriptor.Builder()
        .schema(Event.class).build();
repo.create("events", descriptor);

16
(2. or with the Maven plugin)

$ mvn kite:create-dataset \
  -Dkite.repositoryUri='repo:hive' \
  -Dkite.datasetName=events \
  -Dkite.avroSchemaReflectClass=com.example.Event

17
A peek at the Avro schema

$ hive -e "DESCRIBE EXTENDED events"
...
{
  "type" : "record",
  "name" : "Event",
  "namespace" : "com.example",
  "fields" : [
    { "name" : "id", "type" : "long" },
    { "name" : "timestamp", "type" : "long" },
    { "name" : "source", "type" : "string" }
  ]
18

}
3. Write Events

Logger logger = Logger.getLogger(...);
Event event = new Event();
event.setId(id);
event.setTimestamp(System.currentTimeMillis());
event.setSource(source);
logger.info(event);

19
Log4j configuration

log4j.appender.flume = org.kitesdk.data.flume.Log4jAppender
log4j.appender.flume.Hostname = localhost
log4j.appender.flume.Port = 41415
log4j.appender.flume.DatasetRepositoryUri = repo:hive
log4j.appender.flume.DatasetName = events

20
The resulting file layout

/user
  /hive
    /warehouse
      /events
        /FlumeData.1375659013795
        /FlumeData.1375659013796

21

Avro files
4. Generate Summaries with Crunch

PCollection<Event> events =
    read(asSource(repo.load("events"), Event.class));
PCollection<Summary> summaries = events
    .by(new GetTimeBucket(), // minute of day, source
        Avros.pairs(Avros.longs(), Avros.strings()))
    .groupByKey()
    .parallelDo(new MakeSummary(),
        Avros.reflects(Summary.class));
22

write(summaries, asTarget(repo.load("summaries")));
… and run using Maven

$ mvn kite:create-dataset -Dkite.datasetName=summaries ...
<plugin>
  <groupId>org.kitesdk</groupId>
  <artifactId>kite-maven-plugin</artifactId>
  <configuration>
    <toolClass>com.example.GenerateSummaries</toolClass>
  </configuration>
</plugin>

23

$ mvn kite:run-tool
5. Query with Impala

$ impala-shell -q 'DESCRIBE events'
+-----------+--------+-------------------+
| name      | type   | comment           |
+-----------+--------+-------------------+
| id        | bigint | from deserializer |
| timestamp | bigint | from deserializer |
| source    | string | from deserializer |
+-----------+--------+-------------------+
24
… Ad Hoc Queries

$ impala-shell -q 'SELECT source, COUNT(1) AS cnt
FROM events GROUP BY source'
+--------------------------------------+-----+
| source                               | cnt |
+--------------------------------------+-----+
| 018dc1b6-e6b0-489e-bce3-115917e00632 | 38  |
| bc80040e-09d1-4ad2-8bd8-82afd1b8431a | 85  |
+--------------------------------------+-----+
Returned 2 row(s) in 0.56s
25
… or use JDBC

Class.forName("org.apache.hive.jdbc.HiveDriver");
Connection connection = DriverManager.getConnection(
    "jdbc:hive2://localhost:21050/;auth=noSasl");
Statement statement = connection.createStatement();
ResultSet resultSet = statement.executeQuery(
    "SELECT * FROM summaries");
26
Advanced Kite


27
Unified Storage Interface

•  Dataset – streaming access, HDFS storage
•  RandomAccessDataset – random access, HBase storage
•  PartitionStrategy defines how to map an entity to partitions in HDFS or row keys in HBase

28
Filesystem Partitions

PartitionStrategy p = new PartitionStrategy.Builder()
    .year("timestamp")
    .month("timestamp")
    .day("timestamp").build();
/user/hive/warehouse/events
  /year=2014/month=02/day=08
    /FlumeData.1375659013795
    /FlumeData.1375659013796
29
HBase Keys: Defined in Avro

{
  "name": "username",
  "type": "string",
  "mapping": { "type": "key", "value": "0" }
},
{
  "name": "favoriteColor",
  "type": "string",
  "mapping": { "type": "column", "value": "meta:fc" }
}
30
Random Access Dataset: Creation

RandomAccessDatasetRepository repo =
    DatasetRepositories.openRandomAccess(
        "repo:hbase:localhost");
RandomAccessDataset<User> users = repo.load("users");
users.put(new User("bill", "green"));
users.put(new User("alice", "blue"));

31
Random Access Dataset: Retrieval

Key key = new Key.Builder(users)
    .add("username", "bill").build();
User bill = users.get(key);

32
Views

View<User> view = users.from("username", "bill");
DatasetReader<User> reader = view.newReader();
reader.open();
for (User user : reader) {
  System.out.println(user);
}
reader.close();

33
Parallel Processing

•  Goal is for Hadoop processing frameworks to “just work”
•  Support Formats, Partitions, Views
•  Native Kite components, e.g. DatasetOutputFormat for MR
HDFS Dataset
Crunch
MapReduce
Impala

34

HBase Dataset

Yes

0.12.0

0.12.0

0.12.0

Yes

Planned
Schema Evolution

public class Event {
  private long id;
  private long timestamp;
  private String source;
  @Nullable private String ipAddress;
}
$ mvn kite:update-dataset \
  -Dkite.datasetName=events \
  -Dkite.avroSchemaReflectClass=com.example.Event
35
Searchable Datasets

•  Use Flume Solr Sink (in addition to HDFS Sink)
•  Morphlines library to define fields to index
•  SolrCloud runs on cluster from indexes in HDFS
•  Future support in Kite to index selected fields automatically
36
Conclusion


37
Kite makes it easy to get data into Hadoop
with a flexible schema model that is storage
agnostic in a format that can be processed
with a wide range of Hadoop tools


38
Getting Started With Kite

•  Examples at github.com/kite-sdk/kite-examples
•  Working with streaming and random-access datasets
•  Logging events to datasets from a webapp
•  Running a periodic job
•  Migrating data from CSV to a Kite dataset
•  Converting an Avro dataset to a Parquet dataset
•  Writing and configuring Morphlines
•  Using Morphlines to write JSON records to a dataset
39
Questions?

kitesdk.org
@tom_e_white
tom@cloudera.com
40
41
About me

•  Engineer at Cloudera working on Core Hadoop and Kite
•  Apache Hadoop Committer, PMC Member, Apache Member
•  Author of “Hadoop: The Definitive Guide”

42
Morphlines Example

morphlines : [
  {
    id : morphline1
    importCommands : ["com.cloudera.**", "org.apache.solr.**"]
    commands : [
      { readLine {} }
      {
        grok {
          dictionaryFiles : [/tmp/grok-dictionaries]
          expressions : {
            message : """<%{POSINT:syslog_pri}>%{SYSLOGTIMESTAMP:syslog_timestamp} %{SYSLOGHOST:syslog_hostname} %{DATA:syslog_program}(?:\[%{POSINT:syslog_pid}\])?: %{GREEDYDATA:syslog_message}"""
          }
        }
      }
      { loadSolr {} }
    ]
  }
]

Example Input
<164>Feb  4 10:46:14 syslog sshd[607]: listening on 0.0.0.0 port 22

Output Record
syslog_pri:164
syslog_timestamp:Feb  4 10:46:14
syslog_hostname:syslog
syslog_program:sshd
syslog_pid:607
syslog_message:listening on 0.0.0.0 port 22.

43
Apps

•  App – a packaged Java program that runs on a Hadoop cluster
•  cdk:package-app – create a package on the local filesystem
•  like an exploded WAR
•  Oozie format
•  cdk:deploy-app – copy packaged app to HDFS
•  cdk:run-app – execute the app
•  Workflow app – runs once
•  Coordinator app – runs other apps (like cron)
44

More Related Content

What's hot (9)

PPTX
Hadoop Pig
Madhur Nawandar
 
PDF
houGh documentation
Miroljub Anastasov
 
PDF
Fast track to getting started with DSE Max @ ING
Duyhai Doan
 
PDF
Recognize Godzilla
隊長 アイパー
 
PPTX
Cassandra Summit - What's New In Apache TinkerPop?
Stephen Mallette
 
PDF
Next Generation Programming in R
Florian Uhlitz
 
PDF
Python for R Users
Ajay Ohri
 
PPTX
Access pattern of tags
Harish Chetty
 
PDF
Podlove Podcast Validator
Lars Windauer
 
Hadoop Pig
Madhur Nawandar
 
houGh documentation
Miroljub Anastasov
 
Fast track to getting started with DSE Max @ ING
Duyhai Doan
 
Recognize Godzilla
隊長 アイパー
 
Cassandra Summit - What's New In Apache TinkerPop?
Stephen Mallette
 
Next Generation Programming in R
Florian Uhlitz
 
Python for R Users
Ajay Ohri
 
Access pattern of tags
Harish Chetty
 
Podlove Podcast Validator
Lars Windauer
 

Viewers also liked (20)

PPT
Big Data, Security Intelligence, (And Why I Hate This Title)
Coastal Pet Products, Inc.
 
PPTX
Building hadoop based big data environment
Evans Ye
 
PPTX
Hdp security overview
Hortonworks
 
PPTX
Kerberos, Token and Hadoop
Kai Zheng
 
DOCX
REAL-TIME BIG DATA ANALYTICAL ARCHITECTURE FOR REMOTE SENSING APPLICATION
I3E Technologies
 
PDF
Smart Analytics For The Utility Sector
Herman Bosker
 
PDF
Big Data Security Intelligence and Analytics for Advanced Threat Protection
Blue Coat
 
PDF
Open-BDA - Big Data Hadoop Developer Training 10th & 11th June
Innovative Management Services
 
PPTX
Big Data, Big Content, and Aligning Your Storage Strategy
Hitachi Vantara
 
PPT
Mr. satish kumar, schnieder electric
Rohan Pinto
 
PDF
Open-BDA Hadoop Summt 2014 - Post Summit Report
Innovative Management Services
 
PDF
Demystify big data data science
Mahesh Kumar CV
 
PPTX
Generating Insight from Big Data in Energy and the Environment
David Wallom
 
PPTX
Hadoop security
Shivaji Dutta
 
PPTX
Hadoop Security Today & Tomorrow with Apache Knox
Vinay Shukla
 
PDF
Hadoop Ecosystem Architecture Overview
Senthil Kumar
 
PPTX
"Big Data" in the Energy Industry
Paige Bailey
 
PDF
Real time big data analytical architecture for remote sensing application
LeMeniz Infotech
 
PDF
Big Data Security and Governance
DataWorks Summit/Hadoop Summit
 
Big Data, Security Intelligence, (And Why I Hate This Title)
Coastal Pet Products, Inc.
 
Building hadoop based big data environment
Evans Ye
 
Hdp security overview
Hortonworks
 
Kerberos, Token and Hadoop
Kai Zheng
 
REAL-TIME BIG DATA ANALYTICAL ARCHITECTURE FOR REMOTE SENSING APPLICATION
I3E Technologies
 
Smart Analytics For The Utility Sector
Herman Bosker
 
Big Data Security Intelligence and Analytics for Advanced Threat Protection
Blue Coat
 
Open-BDA - Big Data Hadoop Developer Training 10th & 11th June
Innovative Management Services
 
Big Data, Big Content, and Aligning Your Storage Strategy
Hitachi Vantara
 
Mr. satish kumar, schnieder electric
Rohan Pinto
 
Open-BDA Hadoop Summt 2014 - Post Summit Report
Innovative Management Services
 
Demystify big data data science
Mahesh Kumar CV
 
Generating Insight from Big Data in Energy and the Environment
David Wallom
 
Hadoop security
Shivaji Dutta
 
Hadoop Security Today & Tomorrow with Apache Knox
Vinay Shukla
 
Hadoop Ecosystem Architecture Overview
Senthil Kumar
 
"Big Data" in the Energy Industry
Paige Bailey
 
Real time big data analytical architecture for remote sensing application
LeMeniz Infotech
 
Big Data Security and Governance
DataWorks Summit/Hadoop Summit
 
Ad

Similar to Building Hadoop Data Applications with Kite by Tom White (20)

PDF
Building Hadoop Data Applications with Kite
huguk
 
PPTX
מיכאל
sqlserver.co.il
 
PPT
Hadoop - Introduction to Hadoop
Vibrant Technologies & Computers
 
PPTX
Conexión de MongoDB con Hadoop - Luis Alberto Giménez - CAPSiDE #DevOSSAzureDays
CAPSiDE
 
PDF
OC Big Data Monthly Meetup #5 - Session 1 - Altiscale
Big Data Joe™ Rossi
 
PDF
20081030linkedin
Jeff Hammerbacher
 
PPT
Brust hadoopecosystem
Andrew Brust
 
PDF
Big Data @ Orange - Dev Day 2013 - part 2
ovarene
 
PPTX
Pig on Storm
DataWorks Summit
 
PDF
Hadoop in Data Warehousing
Alexey Grigorev
 
PPTX
Hive - Apache hadoop Bigdata training by Desing Pathshala
Desing Pathshala
 
PPTX
Hadoop Training in Hyderabad
Rajitha D
 
PPTX
Hadoop Training in Hyderabad
CHENNAKESHAVAKATAGAR
 
PPTX
Introduction to HiveQL
kristinferrier
 
PPTX
Expand data analysis tool at scale with Zeppelin
DataWorks Summit
 
PDF
AWS re:Invent re:Cap - 데이터 분석: Amazon EC2 C4 Instance + Amazon EBS - 김일호
Amazon Web Services Korea
 
PPTX
Apache Hive
Ajit Koti
 
PDF
Beginning hive and_apache_pig
Mohamed Ali Mahmoud khouder
 
PPTX
De-Bugging Hive with Hadoop-in-the-Cloud
DataWorks Summit
 
PPTX
Debugging Hive with Hadoop-in-the-Cloud
Soam Acharya
 
Building Hadoop Data Applications with Kite
huguk
 
מיכאל
sqlserver.co.il
 
Hadoop - Introduction to Hadoop
Vibrant Technologies & Computers
 
Conexión de MongoDB con Hadoop - Luis Alberto Giménez - CAPSiDE #DevOSSAzureDays
CAPSiDE
 
OC Big Data Monthly Meetup #5 - Session 1 - Altiscale
Big Data Joe™ Rossi
 
20081030linkedin
Jeff Hammerbacher
 
Brust hadoopecosystem
Andrew Brust
 
Big Data @ Orange - Dev Day 2013 - part 2
ovarene
 
Pig on Storm
DataWorks Summit
 
Hadoop in Data Warehousing
Alexey Grigorev
 
Hive - Apache hadoop Bigdata training by Desing Pathshala
Desing Pathshala
 
Hadoop Training in Hyderabad
Rajitha D
 
Hadoop Training in Hyderabad
CHENNAKESHAVAKATAGAR
 
Introduction to HiveQL
kristinferrier
 
Expand data analysis tool at scale with Zeppelin
DataWorks Summit
 
AWS re:Invent re:Cap - 데이터 분석: Amazon EC2 C4 Instance + Amazon EBS - 김일호
Amazon Web Services Korea
 
Apache Hive
Ajit Koti
 
Beginning hive and_apache_pig
Mohamed Ali Mahmoud khouder
 
De-Bugging Hive with Hadoop-in-the-Cloud
DataWorks Summit
 
Debugging Hive with Hadoop-in-the-Cloud
Soam Acharya
 
Ad

More from The Hive (20)

PDF
"Responsible AI", by Charlie Muirhead
The Hive
 
PPTX
Translating a Trillion Points of Data into Therapies, Diagnostics, and New In...
The Hive
 
PDF
Digital Transformation; Digital Twins for Delivering Business Value in IIoT
The Hive
 
PDF
Quantum Computing (IBM Q) - Hive Think Tank Event w/ Dr. Bob Sutor - 02.22.18
The Hive
 
PPTX
The Hive Think Tank: Rendezvous Architecture Makes Machine Learning Logistics...
The Hive
 
PDF
Data Science in the Enterprise
The Hive
 
PDF
AI in Software for Augmenting Intelligence Across the Enterprise
The Hive
 
PPTX
“ High Precision Analytics for Healthcare: Promises and Challenges” by Sriram...
The Hive
 
PPTX
"The Future of Manufacturing" by Sujeet Chand, SVP&CTO, Rockwell Automation
The Hive
 
PPTX
Social Impact & Ethics of AI by Steve Omohundro
The Hive
 
PDF
The Hive Think Tank: AI in The Enterprise by Venkat Srinivasan
The Hive
 
PDF
The Hive Think Tank: Machine Learning Applications in Genomics by Prof. Jian ...
The Hive
 
PDF
The Hive Think Tank: The Future Of Customer Support - AI Driven Automation
The Hive
 
PPTX
The Hive Think Tank: Talk by Mohandas Pai - India at 2030, How Tech Entrepren...
The Hive
 
PDF
The Hive Think Tank: The Content Trap - Strategist's Guide to Digital Change
The Hive
 
PPTX
Deep Visual Understanding from Deep Learning by Prof. Jitendra Malik
The Hive
 
PDF
The Hive Think Tank: Heron at Twitter
The Hive
 
PPTX
The Hive Think Tank: Unpacking AI for Healthcare
The Hive
 
PPTX
The Hive Think Tank: Translating IoT into Innovation at Every Level by Prith ...
The Hive
 
PDF
The Hive Think Tank - The Microsoft Big Data Stack by Raghu Ramakrishnan, CTO...
The Hive
 
"Responsible AI", by Charlie Muirhead
The Hive
 
Translating a Trillion Points of Data into Therapies, Diagnostics, and New In...
The Hive
 
Digital Transformation; Digital Twins for Delivering Business Value in IIoT
The Hive
 
Quantum Computing (IBM Q) - Hive Think Tank Event w/ Dr. Bob Sutor - 02.22.18
The Hive
 
The Hive Think Tank: Rendezvous Architecture Makes Machine Learning Logistics...
The Hive
 
Data Science in the Enterprise
The Hive
 
AI in Software for Augmenting Intelligence Across the Enterprise
The Hive
 
“ High Precision Analytics for Healthcare: Promises and Challenges” by Sriram...
The Hive
 
"The Future of Manufacturing" by Sujeet Chand, SVP&CTO, Rockwell Automation
The Hive
 
Social Impact & Ethics of AI by Steve Omohundro
The Hive
 
The Hive Think Tank: AI in The Enterprise by Venkat Srinivasan
The Hive
 
The Hive Think Tank: Machine Learning Applications in Genomics by Prof. Jian ...
The Hive
 
The Hive Think Tank: The Future Of Customer Support - AI Driven Automation
The Hive
 
The Hive Think Tank: Talk by Mohandas Pai - India at 2030, How Tech Entrepren...
The Hive
 
The Hive Think Tank: The Content Trap - Strategist's Guide to Digital Change
The Hive
 
Deep Visual Understanding from Deep Learning by Prof. Jitendra Malik
The Hive
 
The Hive Think Tank: Heron at Twitter
The Hive
 
The Hive Think Tank: Unpacking AI for Healthcare
The Hive
 
The Hive Think Tank: Translating IoT into Innovation at Every Level by Prith ...
The Hive
 
The Hive Think Tank - The Microsoft Big Data Stack by Raghu Ramakrishnan, CTO...
The Hive
 

Building Hadoop Data Applications with Kite by Tom White