Spark Installation

1 Install Scala

Install Scala 2.11.8 and Spark 2.3.2: extract each archive into /usr/local and rename the resulting directories to scala and spark.

[root@hadoop1 examples]# cd /usr/local/
[root@hadoop1 local]# ls
bin  games   hbase  include  lib64    nginx   Python-3.8.0      sbin   scala-2.11.8.tgz  spark                          sqoop                                       src
etc  hadoop  hive   lib      libexec  python  Python-3.8.0.tgz  scala  share             spark-2.3.2-bin-hadoop2.7.tgz  sqoop-1.4.4.bin__hadoop-2.0.4-alpha.tar.gz  zookeeper
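
The extraction and rename steps themselves are not shown in the listing above; they amount to something like the following (a sketch, assuming the tarballs were downloaded into /usr/local and extract to their standard directory names):

cd /usr/local
tar -zxf scala-2.11.8.tgz  && mv scala-2.11.8 scala                       # assumed extracted name
tar -zxf spark-2.3.2-bin-hadoop2.7.tgz && mv spark-2.3.2-bin-hadoop2.7 spark   # assumed extracted name
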
2 Update Environment Variables

Modify the environment variables, adding entries for Scala and Spark:


vim /etc/profile

export SCALA_HOME=/usr/local/scala
export SPARK_HOME=/usr/local/spark
export PATH=$PATH:$JAVA_HOME/bin:$JAVA_HOME/sbin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$HBASE_HOME/bin:$HBASE_HOME/sbin:$HIVE_HOME/bin:$HIVE_HOME/sbin:$SQOOP_HOME/bin:$SCALA_HOME/bin:$SPARK_HOME/bin


source /etc/profile
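
A quick sanity check (not part of the original transcript) that the new variables are picked up:

scala -version           # should report Scala 2.11.8
spark-submit --version   # should report Spark 2.3.2
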
3 Configure Spark
[root@hadoop1 spark]# cd /usr/local/spark/conf/
[root@hadoop1 conf]# ls
docker.properties.template  log4j.properties.template    slaves           spark-defaults.conf           spark-env.sh
fairscheduler.xml.template  metrics.properties.template  slaves.template  spark-defaults.conf.template  spark-env.sh.template



[root@hadoop1 conf]# cp slaves.template slaves
[root@hadoop1 conf]# cp spark-env.sh.template spark-env.sh
[root@hadoop1 conf]# cp spark-defaults.conf.template spark-defaults.conf

[root@hadoop1 conf]# cat spark-env.sh
#!/usr/bin/env bash

export JAVA_HOME=/usr/java/jdk1.8.0_271-amd64
export SCALA_HOME=/usr/local/scala
export HADOOP_HOME=/usr/local/hadoop
export HADOOP_CONF_DIR=/usr/local/hadoop/etc/hadoop
export SPARK_MASTER_HOST=hadoop1
export SPARK_PID_DIR=/usr/local/spark/data/pid
export SPARK_LOCAL_DIRS=/usr/local/spark/data/spark_shuffle
export SPARK_EXECUTOR_MEMORY=2G
export SPARK_WORKER_MEMORY=8G
[root@hadoop1 conf]#
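
SPARK_PID_DIR and SPARK_LOCAL_DIRS both point under /usr/local/spark/data. Spark's scripts generally create these paths on demand, but creating them up front on the master and both workers is a harmless safeguard (a sketch, not from the original transcript):

mkdir -p /usr/local/spark/data/pid /usr/local/spark/data/spark_shuffle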

[root@hadoop1 conf]# cat spark-defaults.conf
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Default system properties included when running spark-submit.
# This is useful for setting default environmental settings.

# Example:
spark.master                     spark://hadoop1:7077
spark.eventLog.enabled           true
spark.eventLog.dir               hdfs://hadoop1:9000/eventLog
spark.serializer                 org.apache.spark.serializer.KryoSerializer
spark.driver.memory              2g
# spark.executor.extraJavaOptions  -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three"
[root@hadoop1 conf]#
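
Unlike the local directories above, the HDFS event log directory is not created automatically; it must exist before an application starts, otherwise event logging fails. A sketch, assuming the hdfs client is on the PATH:

hdfs dfs -mkdir -p hdfs://hadoop1:9000/eventLog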

# Add the worker (compute) nodes
[root@hadoop1 conf]# cat slaves
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# A Spark Worker will be started on each of the machines listed below.
hadoop2
hadoop3
[root@hadoop1 conf]#


# Distribute scala and spark to the workers hadoop2 and hadoop3, and update /etc/profile on each (sketched below)
[root@hadoop1 conf]# scp -r /usr/local/scala hadoop2:/usr/local/
[root@hadoop1 conf]# scp -r /usr/local/scala hadoop3:/usr/local/
[root@hadoop1 conf]# scp -r /usr/local/spark hadoop2:/usr/local/
[root@hadoop1 conf]# scp -r /usr/local/spark hadoop3:/usr/local/
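
The /etc/profile changes from step 2 also need to be applied on each worker. A minimal sketch (assuming the same install paths on hadoop2 and hadoop3):

for host in hadoop2 hadoop3; do
  ssh "$host" "cat >> /etc/profile" <<'EOF'
export SCALA_HOME=/usr/local/scala
export SPARK_HOME=/usr/local/spark
export PATH=$PATH:$SCALA_HOME/bin:$SPARK_HOME/bin
EOF
done
# then run 'source /etc/profile' on each worker (or simply log in again)
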
4 Test Spark
# Start Hadoop
[root@hadoop1 conf]# start-all.sh
Starting namenodes on [hadoop1]
Last login: Tue Feb  2 08:55:23 CST 2021 from 10.2.33.165 on pts/0
hadoop1: namenode is running as process 19374.  Stop it first.
Starting datanodes
Last login: Tue Feb  2 11:28:41 CST 2021 on pts/0
hadoop3: datanode is running as process 9056.  Stop it first.
hadoop2: datanode is running as process 7838.  Stop it first.
hadoop1: datanode is running as process 19541.  Stop it first.
Starting secondary namenodes [hadoop1]
Last login: Tue Feb  2 11:28:42 CST 2021 on pts/0
hadoop1: secondarynamenode is running as process 19817.  Stop it first.
Starting resourcemanager
Last login: Tue Feb  2 11:28:44 CST 2021 on pts/0
resourcemanager is running as process 20121.  Stop it first.
Starting nodemanagers
Last login: Tue Feb  2 11:28:50 CST 2021 on pts/0
hadoop2: nodemanager is running as process 7992.  Stop it first.
hadoop3: nodemanager is running as process 9205.  Stop it first.
hadoop1: nodemanager is running as process 20294.  Stop it first.

# Start Spark
[root@hadoop1 conf]# /usr/local/spark/sbin/start-all.sh
org.apache.spark.deploy.master.Master running as process 31886.  Stop it first.
hadoop3: org.apache.spark.deploy.worker.Worker running as process 28606.  Stop it first.
hadoop2: org.apache.spark.deploy.worker.Worker running as process 25751.  Stop it first.
[root@hadoop1 conf]#
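
A quick way to confirm the standalone cluster is up (not from the original output): the Master process should appear in jps on hadoop1 and a Worker on hadoop2/hadoop3, and the master web UI defaults to http://hadoop1:8080.

jps | grep -E 'Master|Worker'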

# Enter the examples source directory and locate the Pi calculation program
[root@hadoop1 conf]# cd /usr/local/spark/examples/src/main/
java/      python/    r/         resources/ scala/
[root@hadoop1 conf]# cd /usr/local/spark/examples/src/main/scala/org/apache/spark/examples/
[root@hadoop1 examples]# ls
BroadcastTest.scala          graphx             LocalFileLR.scala  LogQuery.scala            pythonconverters               SparkHdfsLR.scala    SparkPi.scala
DFSReadWriteTest.scala       GroupByTest.scala  LocalKMeans.scala  ml                        SimpleSkewedGroupByTest.scala  SparkKMeans.scala    SparkTC.scala
DriverSubmissionTest.scala   HdfsTest.scala     LocalLR.scala      mllib                     SkewedGroupByTest.scala        SparkLR.scala        sql
ExceptionHandlingTest.scala  LocalALS.scala     LocalPi.scala      MultiBroadcastTest.scala  SparkALS.scala                 SparkPageRank.scala  streaming
[root@hadoop1 examples]# run-example S
SimpleSkewedGroupByTest.scala  SparkALS.scala                 SparkKMeans.scala              SparkPageRank.scala            SparkTC.scala
SkewedGroupByTest.scala        SparkHdfsLR.scala              SparkLR.scala                  SparkPi.scala

# Run the Pi calculation example
[root@hadoop1 examples]# run-example SparkPi

2021-02-02 11:31:59 INFO  DAGScheduler:54 - Job 0 finished: reduce at SparkPi.scala:38, took 2.880396 s
Pi is roughly 3.147035735178676
2021-02-02 11:31:59 INFO  AbstractConnector:318 - Stopped Spark@7f4037ed{HTTP/1.1,[http/1.1]}{0.0.0.0:4040}
2021-02-02 11:31:59 INFO  SparkUI:54 - Stopped Spark web UI at http://hadoop1:4040
2021-02-02 11:31:59 INFO  StandaloneSchedulerBackend:54 - Shutting down all executors
2021-02-02 11:31:59 INFO  CoarseGrainedSchedulerBackend$DriverEndpoint:54 - Asking each executor to shut down
2021-02-02 11:31:59 INFO  MapOutputTrackerMasterEndpoint:54 - MapOutputTrackerMasterEndpoint stopped!
2021-02-02 11:32:00 INFO  MemoryStore:54 - MemoryStore cleared
2021-02-02 11:32:00 INFO  BlockManager:54 - BlockManager stopped
2021-02-02 11:32:00 INFO  BlockManagerMaster:54 - BlockManagerMaster stopped
2021-02-02 11:32:00 INFO  OutputCommitCoordinator$OutputCommitCoordinatorEndpoint:54 - OutputCommitCoordinator stopped!
2021-02-02 11:32:00 INFO  SparkContext:54 - Successfully stopped SparkContext
2021-02-02 11:32:00 INFO  ShutdownHookManager:54 - Shutdown hook called
2021-02-02 11:32:00 INFO  ShutdownHookManager:54 - Deleting directory /usr/local/spark/data/spark_shuffle/spark-5b21001c-fbab-4ce1-afe7-a697dcffd8b9
2021-02-02 11:32:00 INFO  ShutdownHookManager:54 - Deleting directory /tmp/spark-26692d5a-a342-4b02-aeae-5209ed6f90f0
[root@hadoop1 examples]#
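
run-example is a thin wrapper around spark-submit, so the spark.master set in spark-defaults.conf applies here as well and the job runs on the standalone cluster. An equivalent explicit submission looks roughly like this (a sketch; the examples jar name is assumed from the standard 2.3.2 build and may differ):

spark-submit --master spark://hadoop1:7077 \
  --class org.apache.spark.examples.SparkPi \
  /usr/local/spark/examples/jars/spark-examples_2.11-2.3.2.jar 100   # jar name assumed; adjust if needed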


5 Integrate Spark SQL with Hive

Copy core-site.xml and hdfs-site.xml from the Hadoop configuration directory /usr/local/hadoop/etc/hadoop/ to /usr/local/spark/conf/, and copy hive-site.xml from the Hive configuration directory /usr/local/hive/conf/ to /usr/local/spark/conf/.
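
The copy commands themselves (a sketch matching the paths above):

cp /usr/local/hadoop/etc/hadoop/core-site.xml /usr/local/spark/conf/
cp /usr/local/hadoop/etc/hadoop/hdfs-site.xml /usr/local/spark/conf/
cp /usr/local/hive/conf/hive-site.xml /usr/local/spark/conf/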

# Verify the result (only the relevant files are shown):
[root@hadoop1 conf]# ls -l /usr/local/spark/conf/
-rw-r--r--. 1 root root   1118 Feb  6 15:16 hdfs-site.xml
-rw-r--r--. 1 root root   1187 Feb  6 15:16 core-site.xml
-rw-r--r--. 1 root root   1281 Feb  2 10:51 spark-defaults.conf
-rw-r--r--. 1 root root 257513 Feb  6 16:06 hive-site.xml

Copy the MySQL driver from the Hive installation's lib directory to Spark's jars directory, and distribute it to the other machines.

[root@hadoop1 hive]# cp /usr/local/hive/lib/mysql-connector-java-5.1.46.jar  /usr/local/spark/jars/
[root@hadoop1 hive]# scp -r  /usr/local/spark/jars/mysql-connector-java-5.1.46.jar  hadoop2:/usr/local/spark/jars/
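
The transcript only shows hadoop2; presumably the same copy is needed for hadoop3, the other worker listed in slaves (assuming the same paths):

scp /usr/local/spark/jars/mysql-connector-java-5.1.46.jar hadoop3:/usr/local/spark/jars/   # hadoop3 copy assumed, not in the original transcript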

Start spark-sql:

[root@hadoop2 conf]# spark-sql --driver-class-path /usr/local/spark/jars/mysql-connector-java-5.1.46.jar
2021-02-06 16:07:24 WARN  NativeCodeLoader:62 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2021-02-06 16:07:25 WARN  HiveConf:2753 - HiveConf of name hive.metastore.client.capability.check does not exist
2021-02-06 16:07:25 WARN  HiveConf:2753 - HiveConf of name hive.metastore.hbase.aggregate.stats.false.positive.probability does not exist
2021-02-06 16:07:25 WARN  HiveConf:2753 - HiveConf of name hive.druid.broker.address.default does not exist
2021-02-06 16:07:25 WARN  HiveConf:2753 - HiveConf of name hive.llap.io.orc.time.counters does not exist

The following error appears:

Caused by: MetaException(message:Hive Schema version 1.2.0 does not match metastore's schema version 2.3.0 Metastore is not upgraded or corrupt)
        at org.apache.hadoop.hive.metastore.ObjectStore.checkSchema(ObjectStore.java:6679)
        at org.apache.hadoop.hive.metastore.ObjectStore.verifySchema(ObjectStore.java:6645)
        at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
        at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
        at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
        at java.lang.reflect.Method.invoke(Method.java:498)

Modify /usr/local/spark/conf/hive-site.xml to disable schema verification:

<property>
    <name>hive.metastore.schema.verification</name>
    <value>false</value>
</property>

Starting again succeeds:

[root@hadoop2 conf]# spark-sql --driver-class-path /usr/local/spark/jars/mysql-connector-java-5.1.46.jar
2021-02-06 16:38:39 WARN  NativeCodeLoader:62 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2021-02-06 16:38:40 WARN  HiveConf:2753 - HiveConf of name hive.metastore.client.capability.check does not exist



         > show databases;

2021-02-06 16:39:27 INFO  HiveMetaStore:746 - 0: get_database: global_temp
2021-02-06 16:39:27 INFO  audit:371 - ugi=root  ip=unknown-ip-addr      cmd=get_database: global_temp
2021-02-06 16:39:27 WARN  ObjectStore:568 - Failed to get database global_temp, returning NoSuchObjectException
2021-02-06 16:39:30 INFO  HiveMetaStore:746 - 0: get_databases: *
2021-02-06 16:39:30 INFO  audit:371 - ugi=root  ip=unknown-ip-addr      cmd=get_databases: *
2021-02-06 16:39:30 INFO  CodeGenerator:54 - Code generated in 411.707014 ms
default
hero
Time taken: 3.734 seconds, Fetched 2 row(s)
2021-02-06 16:39:30 INFO  SparkSQLCLIDriver:951 - Time taken: 3.734 seconds, Fetched 2 row(s)
spark-sql>
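
For a non-interactive check, the same query can be passed with -e (a sketch, not from the original transcript):

spark-sql --driver-class-path /usr/local/spark/jars/mysql-connector-java-5.1.46.jar -e "show databases;"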

Hadoop installation reference: https://blog.csdn.net/zhaoyaxiong_ctu/article/details/113151938
