Submit the program to YARN
/export/server/spark/bin/spark-submit --master yarn --py-files defs.py /root/main.py
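Note that when the job is submitted to YARN, the executors run on cluster nodes and cannot see a local relative path like ../data/input/SogouQ.txt, so the input generally needs to live on a shared filesystem such as HDFS. A minimal sketch of the replacement textFile call in main.py, assuming the log has been uploaded to a hypothetical HDFS location (swap in your own NameNode address and path):

# Hypothetical HDFS path -- adjust hdfs://node1:8020/input/SogouQ.txt to your cluster.
word_field = sc.textFile('hdfs://node1:8020/input/SogouQ.txt')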
Data format
Each line of SogouQ.txt is tab-separated, following the Sogou query-log layout: access time, user ID, search keyword, rank of the result in the returned list, sequence number of the user's click, and the clicked URL.
00:00:00  2982199073774412  传智播客  8  3  http://www.itcast.cn
00:00:00  07594220010824798  黑马程序员  1  1  http://www.itcast.cn
00:00:00  5228056822071097  传智播客  14  5  http://www.itcast.cn
00:00:00  6140463203615646  博学谷  62  36  http://www.itcast.cn
00:00:00  8561366108033201  IDEA  3  2  http://www.itcast.cn
00:00:00  23908140386148713  传智专修学院  1  2  http://www.itcast.cn
23:00:00  1797943298449139  flume  8  5  http://www.itcast.cn
23:00:00  00717725924582846  itcast  1  2  http://www.itcast.cn
23:00:00  41416219018952116  bigdata  2  6  http://www.itcast.cn
23:00:00  9975666857142764  IDEA  2  2  http://www.itcast.cn
23:00:00  21603374619077448  酷丁鱼  1  6  http://www.itcast.cn
23:00:00  7423866288265172  bigdata  3  13  http://www.itcast.cn
23:00:00  0616877776407358  itcast  2  9  http://www.itcast.cn
23:00:00  3933365481995287  flume  6  3  http://www.itcast.cn
23:00:00  8242389147671512  数据仓库  2  3  http://www.itcast.cn
23:00:00  8248403977107859  传智汇  1  1  http://www.itcast.cn
23:00:00  6239728533154822  itheima  7  9  http://www.itcast.cn
23:00:00  6551182914925117  itcast  6  4  http://www.itcast.cn
23:00:00  2345161545038265  传智汇  2  1  http://www.itcast.cn
23:00:00  06478452674857782  hadoop  4  1  http://www.itcast.cn
23:00:00  23405445793591878  博学谷  4  5  http://www.itcast.cn
23:00:00  23457845712802688  hadoop  4  1  http://www.itcast.cn
23:00:00  4625224675315291  spark  2  6  http://www.itcast.cn
23:00:00  048731119953599966  itheima  10  44  http://www.itcast.cn
23:00:00  48530765688455246  hadoop  26  3  http://www.itcast.cn
23:00:00  5750662932822584  酷丁鱼  3  2  http://www.itcast.cn
23:00:00  11515839301781111  spark  4  3  http://www.itcast.cn
23:00:00  26706283607513126  java  5  6  http://www.itcast.cn
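To make the field indexes used in main.py concrete, here is how one sample line from the data above splits on tabs (plain Python, no Spark needed):

line = '00:00:00\t2982199073774412\t传智播客\t8\t3\thttp://www.itcast.cn'
fields = line.split('\t')
print(fields[0])  # access time    -> '00:00:00'
print(fields[1])  # user ID        -> '2982199073774412'
print(fields[2])  # search keyword -> '传智播客'
print(fields[5])  # clicked URL    -> 'http://www.itcast.cn'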
main.py
from pyspark import SparkConf, SparkContext, StorageLevel
from defs import jieba_handle, file_suazi, append_word, handle_split
from operator import add

if __name__ == '__main__':
    conf = SparkConf().setAppName('test').setMaster('local[*]')
    sc = SparkContext(conf=conf)

    # Read the log and split each line on tabs; cache the RDD
    # because all three requirements reuse it.
    word_field = sc.textFile('../data/input/SogouQ.txt')
    split_field = word_field.map(lambda x: x.split('\t'))
    split_field.persist(storageLevel=StorageLevel.MEMORY_ONLY)

    # Requirement 1: top 5 search terms after jieba segmentation.
    resig_word = split_field.map(lambda x: x[2])   # keyword field
    word = resig_word.flatMap(jieba_handle)        # segment into terms
    filter_rdd = word.filter(file_suazi)           # drop fragment terms
    append_rdd = filter_rdd.map(append_word)       # normalize -> (term, 1)
    result = append_rdd.reduceByKey(lambda a, b: a + b) \
        .sortBy(lambda x: x[1], ascending=False, numPartitions=1) \
        .take(5)
    print('Requirement 1:', result)

    # Requirement 2: top 5 (user_id, term) combinations.
    map_rdd = split_field.map(lambda x: (x[1], x[2]))
    split_rdd = map_rdd.flatMap(handle_split)
    result2 = split_rdd.reduceByKey(lambda a, b: a + b) \
        .sortBy(lambda x: x[1], ascending=False, numPartitions=1) \
        .take(5)
    print('Requirement 2:', result2)

    # Requirement 3: top 5 hours by query volume; the hour is the part of
    # the time field before the first ':'.
    flatmap_rdd = split_field.map(lambda x: (x[0].split(':')[0], 1))
    result3 = flatmap_rdd.reduceByKey(add) \
        .sortBy(lambda x: x[1], ascending=False, numPartitions=1) \
        .take(5)
    print('Requirement 3:', result3)
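All three requirements share the same reduceByKey → sortBy → take(5) pattern; a minimal standalone sketch with made-up pairs (not the SogouQ data) shows what it produces:

from operator import add
from pyspark import SparkConf, SparkContext

sc = SparkContext(conf=SparkConf().setAppName('pattern-demo').setMaster('local[*]'))
pairs = sc.parallelize([('spark', 1), ('hadoop', 1), ('spark', 1)])
# Sum counts per key, sort by count descending, keep the top 5.
top = pairs.reduceByKey(add).sortBy(lambda x: x[1], ascending=False, numPartitions=1).take(5)
print(top)  # [('spark', 2), ('hadoop', 1)]
sc.stop()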
defs.py
import jieba

def jieba_handle(data):
    # Search-engine-style segmentation; returns an iterator of terms.
    return jieba.cut_for_search(data, HMM=True)

def file_suazi(data):
    # Drop single-character fragments ('谷', '帮', '客') left over from segmentation.
    return data not in ['谷', '帮', '客']

def append_word(data):
    # Re-assemble brand names that jieba splits apart, then pair with a count of 1.
    if data == '传智播':
        data = '传智播客'
    if data == '院校':
        data = '校园帮'
    if data == '博学':
        data = '博学谷'
    return (data, 1)

def handle_split(data):
    # data is a (user_id, query) pair; emit ('<user_id>_<term>', 1) for each kept term.
    user_id, query = data
    return_list = []
    for w in jieba_handle(query):
        if file_suazi(w):
            return_list.append((user_id + '_' + append_word(w)[0], 1))
    return return_list
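The helpers can be smoke-tested locally without Spark, assuming jieba is installed (pip install jieba) and defs.py is on the import path; the exact terms depend on jieba's dictionary, so treat the printed output as illustrative:

from defs import jieba_handle, file_suazi, append_word, handle_split

print(list(jieba_handle('传智播客')))                  # jieba terms for one query
print(append_word('传智播'))                           # -> ('传智播客', 1)
print(handle_split(('2982199073774412', '传智播客')))  # [('2982199073774412_<term>', 1), ...]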