Submit the program to YARN
/export/server/spark/bin/spark-submit --master yarn --py-files defs.py /root/main.py
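Note that when the job is submitted to YARN, the executors run on cluster nodes and cannot see a local relative path like ../data/input/SogouQ.txt, so the input generally needs to live on a shared filesystem such as HDFS. A minimal sketch of the replacement textFile call in main.py, assuming the log has been uploaded to a hypothetical HDFS location (swap in your own NameNode address and path):

# Hypothetical HDFS path -- adjust hdfs://node1:8020/input/SogouQ.txt to your cluster.
word_field = sc.textFile('hdfs://node1:8020/input/SogouQ.txt')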
Data format
Each line of SogouQ.txt is tab-separated, following the Sogou query-log layout: access time, user ID, search keyword, rank of the result in the returned list, sequence number of the user's click, and the clicked URL.
00:00:00  2982199073774412  传智播客  8  3  http://www.itcast.cn
00:00:00  07594220010824798  黑马程序员  1  1  http://www.itcast.cn
00:00:00  5228056822071097  传智播客  14  5  http://www.itcast.cn
00:00:00  6140463203615646  博学谷  62  36  http://www.itcast.cn
00:00:00  8561366108033201  IDEA  3  2  http://www.itcast.cn
00:00:00  23908140386148713  传智专修学院  1  2  http://www.itcast.cn
23:00:00  1797943298449139  flume  8  5  http://www.itcast.cn
23:00:00  00717725924582846  itcast  1  2  http://www.itcast.cn
23:00:00  41416219018952116  bigdata  2  6  http://www.itcast.cn
23:00:00  9975666857142764  IDEA  2  2  http://www.itcast.cn
23:00:00  21603374619077448  酷丁鱼  1  6  http://www.itcast.cn
23:00:00  7423866288265172  bigdata  3  13  http://www.itcast.cn
23:00:00  0616877776407358  itcast  2  9  http://www.itcast.cn
23:00:00  3933365481995287  flume  6  3  http://www.itcast.cn
23:00:00  8242389147671512  数据仓库  2  3  http://www.itcast.cn
23:00:00  8248403977107859  传智汇  1  1  http://www.itcast.cn
23:00:00  6239728533154822  itheima  7  9  http://www.itcast.cn
23:00:00  6551182914925117  itcast  6  4  http://www.itcast.cn
23:00:00  2345161545038265  传智汇  2  1  http://www.itcast.cn
23:00:00  06478452674857782  hadoop  4  1  http://www.itcast.cn
23:00:00  23405445793591878  博学谷  4  5  http://www.itcast.cn
23:00:00  23457845712802688  hadoop  4  1  http://www.itcast.cn
23:00:00  4625224675315291  spark  2  6  http://www.itcast.cn
23:00:00  048731119953599966  itheima  10  44  http://www.itcast.cn
23:00:00  48530765688455246  hadoop  26  3  http://www.itcast.cn
23:00:00  5750662932822584  酷丁鱼  3  2  http://www.itcast.cn
23:00:00  11515839301781111  spark  4  3  http://www.itcast.cn
23:00:00  26706283607513126  java  5  6  http://www.itcast.cn
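To make the field indexes used in main.py concrete, here is how one sample line from the data above splits on tabs (plain Python, no Spark needed):

line = '00:00:00\t2982199073774412\t传智播客\t8\t3\thttp://www.itcast.cn'
fields = line.split('\t')
print(fields[0])  # access time    -> '00:00:00'
print(fields[1])  # user ID        -> '2982199073774412'
print(fields[2])  # search keyword -> '传智播客'
print(fields[5])  # clicked URL    -> 'http://www.itcast.cn'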
main.py
from pyspark import SparkConf, SparkContext, StorageLevel
from defs import jieba_handle, file_suazi, append_word, handle_split
from operator import add

if __name__ == '__main__':
    conf = SparkConf().setAppName('test').setMaster('local[*]')
    sc = SparkContext(conf=conf)

    # Read the log and split each line on tabs; cache the RDD
    # because all three requirements reuse it.
    word_field = sc.textFile('../data/input/SogouQ.txt')
    split_field = word_field.map(lambda x: x.split('\t'))
    split_field.persist(storageLevel=StorageLevel.MEMORY_ONLY)

    # Requirement 1: top 5 search terms after jieba segmentation.
    resig_word = split_field.map(lambda x: x[2])   # keyword field
    word = resig_word.flatMap(jieba_handle)        # segment into terms
    filter_rdd = word.filter(file_suazi)           # drop fragment terms
    append_rdd = filter_rdd.map(append_word)       # normalize -> (term, 1)
    result = append_rdd.reduceByKey(lambda a, b: a + b) \
        .sortBy(lambda x: x[1], ascending=False, numPartitions=1) \
        .take(5)
    print('Requirement 1:', result)

    # Requirement 2: top 5 (user_id, term) combinations.
    map_rdd = split_field.map(lambda x: (x[1], x[2]))
    split_rdd = map_rdd.flatMap(handle_split)
    result2 = split_rdd.reduceByKey(lambda a, b: a + b) \
        .sortBy(lambda x: x[1], ascending=False, numPartitions=1) \
        .take(5)
    print('Requirement 2:', result2)

    # Requirement 3: top 5 hours by query volume; the hour is the part of
    # the time field before the first ':'.
    flatmap_rdd = split_field.map(lambda x: (x[0].split(':')[0], 1))
    result3 = flatmap_rdd.reduceByKey(add) \
        .sortBy(lambda x: x[1], ascending=False, numPartitions=1) \
        .take(5)
    print('Requirement 3:', result3)
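All three requirements share the same reduceByKey → sortBy → take(5) pattern; a minimal standalone sketch with made-up pairs (not the SogouQ data) shows what it produces:

from operator import add
from pyspark import SparkConf, SparkContext

sc = SparkContext(conf=SparkConf().setAppName('pattern-demo').setMaster('local[*]'))
pairs = sc.parallelize([('spark', 1), ('hadoop', 1), ('spark', 1)])
# Sum counts per key, sort by count descending, keep the top 5.
top = pairs.reduceByKey(add).sortBy(lambda x: x[1], ascending=False, numPartitions=1).take(5)
print(top)  # [('spark', 2), ('hadoop', 1)]
sc.stop()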
defs.py
import jieba

def jieba_handle(data):
    # Search-engine-style segmentation; returns an iterator of terms.
    return jieba.cut_for_search(data, HMM=True)

def file_suazi(data):
    # Drop single-character fragments ('谷', '帮', '客') left over from segmentation.
    return data not in ['谷', '帮', '客']

def append_word(data):
    # Re-assemble brand names that jieba splits apart, then pair with a count of 1.
    if data == '传智播':
        data = '传智播客'
    if data == '院校':
        data = '校园帮'
    if data == '博学':
        data = '博学谷'
    return (data, 1)

def handle_split(data):
    # data is a (user_id, query) pair; emit ('<user_id>_<term>', 1) for each kept term.
    user_id, query = data
    return_list = []
    for w in jieba_handle(query):
        if file_suazi(w):
            return_list.append((user_id + '_' + append_word(w)[0], 1))
    return return_list
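The helpers can be smoke-tested locally without Spark, assuming jieba is installed (pip install jieba) and defs.py is on the import path; the exact terms depend on jieba's dictionary, so treat the printed output as illustrative:

from defs import jieba_handle, file_suazi, append_word, handle_split

print(list(jieba_handle('传智播客')))                  # jieba terms for one query
print(append_word('传智播'))                           # -> ('传智播客', 1)
print(handle_split(('2982199073774412', '传智播客')))  # [('2982199073774412_<term>', 1), ...]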