Write a Spark job in Scala that extracts a subset of the data from the tables customer_inf, order_detail, order_master, and product_info in the MySQL database ds_db01 into the Hive ods database.
SQL file downloads:
123 Cloud Drive: https://www.123684.com/s/kgrAjv-z7q3d
Aliyun Drive: https://www.alipan.com/s/A5QjDUv8PXQ
Add the following dependencies to pom.xml:
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.12</artifactId>
    <version>3.1.2</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-hive_2.12</artifactId>
    <version>3.1.2</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.12</artifactId>
    <version>3.1.2</version>
</dependency>
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>5.1.37</version>
</dependency>
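To run any of the jobs below, package the project (for example with mvn package) and submit it with spark-submit. A minimal sketch, assuming a YARN cluster and hypothetical jar names; the MySQL connector must be visible to the driver and executors, either shaded into the application jar or passed via --jars:
spark-submit \
  --master yarn \
  --deploy-mode client \
  --class mysqltohive \
  --jars mysql-connector-java-5.1.37.jar \
  spark-etl.jar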
1.customer_inf
import org.apache.spark.sql._
import org.apache.spark.sql.functions.lit
import java.text.SimpleDateFormat
import java.util.{Calendar, Properties}

object mysqltohive {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("customer_inf")
      .enableHiveSupport()
      .config("spark.sql.warehouse.dir", "/user/hive/warehouse")
      .getOrCreate()

    // Connect to the MySQL database
    val jdbc = "jdbc:mysql://bigdata1:3306/ds_db01"
    val table = "customer_inf"
    val properties = new Properties()
    properties.put("user", "root")
    properties.put("password", "123456")
    properties.put("driver", "com.mysql.jdbc.Driver")

    // Register the MySQL table as a temporary view, then filter the extraction window
    spark.read.jdbc(jdbc, table, properties).createOrReplaceTempView("customer_inf")
    val df = spark.sql("SELECT * FROM customer_inf WHERE modified_time > '2022-08-20 22:19:36' AND modified_time < '2022-08-21 22:34:46'")

    // Stamp each row with etl_date: the previous day, formatted as yyyyMMdd
    val dateFormat = new SimpleDateFormat("yyyyMMdd")
    val day = dateFormat.format(Calendar.getInstance().getTime.getTime - 24 * 60 * 60 * 1000)
    val frame1 = df.withColumn("etl_date", lit(day))

    // Write into the Hive ods database, partitioned by etl_date
    frame1.write.mode("overwrite").partitionBy("etl_date").saveAsTable("ods.customer_inf")
  }
}
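After the job finishes, a quick way to verify the load from spark-shell (the partition value will be whatever yesterday's date was when the job ran):
spark.sql("SHOW PARTITIONS ods.customer_inf").show(false)
spark.sql("SELECT COUNT(*) FROM ods.customer_inf").show()
The remaining three tables follow the same pattern, differing only in the table name and time window.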
2.product_info
import org.apache.spark.sql._
import org.apache.spark.sql.functions.lit
import java.text.SimpleDateFormat
import java.util.{Calendar, Properties}

object mysqltohive2 {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("product_info")
      .enableHiveSupport()
      .config("spark.sql.warehouse.dir", "/user/hive/warehouse")
      .getOrCreate()

    // Connect to the MySQL database
    val jdbc = "jdbc:mysql://bigdata1:3306/ds_db01"
    val table = "product_info"
    val properties = new Properties()
    properties.put("user", "root")
    properties.put("password", "123456")
    properties.put("driver", "com.mysql.jdbc.Driver")

    // Register the MySQL table as a temporary view, then filter the extraction window
    spark.read.jdbc(jdbc, table, properties).createOrReplaceTempView("product_info")
    val df = spark.sql("SELECT * FROM product_info WHERE modified_time > '2022-09-03 04:09:42' AND modified_time < '2022-09-04 00:32:00'")

    // Stamp each row with etl_date: the previous day, formatted as yyyyMMdd
    val dateFormat = new SimpleDateFormat("yyyyMMdd")
    val day = dateFormat.format(Calendar.getInstance().getTime.getTime - 24 * 60 * 60 * 1000)
    val frame1 = df.withColumn("etl_date", lit(day))

    // Write into the Hive ods database, partitioned by etl_date
    frame1.write.mode("overwrite").partitionBy("etl_date").saveAsTable("ods.product_info")
  }
}
3.order_master
import org.apache.spark.sql._
import org.apache.spark.sql.functions.lit
import java.text.SimpleDateFormat
import java.util.{Calendar, Properties}

object mysqltohive3 {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("order_master")
      .enableHiveSupport()
      .config("spark.sql.warehouse.dir", "/user/hive/warehouse")
      .getOrCreate()

    // Connect to the MySQL database
    val jdbc = "jdbc:mysql://bigdata1:3306/ds_db01"
    val table = "order_master"
    val properties = new Properties()
    properties.put("user", "root")
    properties.put("password", "123456")
    properties.put("driver", "com.mysql.jdbc.Driver")

    // Register the MySQL table as a temporary view, then filter the extraction window
    // (note: no trailing semicolon inside spark.sql, which would fail to parse)
    spark.read.jdbc(jdbc, table, properties).createOrReplaceTempView("order_master")
    val df = spark.sql("SELECT * FROM order_master WHERE modified_time > '2022-03-17 10:05:28' AND modified_time < '2022-03-20 15:20:45'")

    // Stamp each row with etl_date: the previous day, formatted as yyyyMMdd
    val dateFormat = new SimpleDateFormat("yyyyMMdd")
    val day = dateFormat.format(Calendar.getInstance().getTime.getTime - 24 * 60 * 60 * 1000)
    val frame1 = df.withColumn("etl_date", lit(day))

    // Write into the Hive ods database, partitioned by etl_date
    frame1.write.mode("overwrite").partitionBy("etl_date").saveAsTable("ods.order_master")
  }
}
4.order_detail
import org.apache.spark.sql._
import org.apache.spark.sql.functions.lit
import java.text.SimpleDateFormat
import java.util.{Calendar, Properties}

object mysqltohive4 {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("order_detail")
      .enableHiveSupport()
      .config("spark.sql.warehouse.dir", "/user/hive/warehouse")
      .getOrCreate()

    // Connect to the MySQL database
    val jdbc = "jdbc:mysql://bigdata1:3306/ds_db01"
    val table = "order_detail"
    val properties = new Properties()
    properties.put("user", "root")
    properties.put("password", "123456")
    properties.put("driver", "com.mysql.jdbc.Driver")

    // Register the MySQL table as a temporary view, then filter the extraction window
    spark.read.jdbc(jdbc, table, properties).createOrReplaceTempView("order_detail")
    val df = spark.sql("SELECT * FROM order_detail WHERE modified_time > '2022-03-17 21:07:34' AND modified_time < '2022-03-18 23:18:34'")

    // Stamp each row with etl_date: the previous day, formatted as yyyyMMdd
    val dateFormat = new SimpleDateFormat("yyyyMMdd")
    val day = dateFormat.format(Calendar.getInstance().getTime.getTime - 24 * 60 * 60 * 1000)
    val frame1 = df.withColumn("etl_date", lit(day))

    // Write into the Hive ods database, partitioned by etl_date
    frame1.write.mode("overwrite").partitionBy("etl_date").saveAsTable("ods.order_detail")
  }
}
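Since the four jobs differ only in the table name and the modified_time window, they can also be collapsed into a single parameterized job. A minimal sketch, assuming the same connection settings as above; the object name MysqlToHiveGeneric and the command-line arguments are hypothetical:
import org.apache.spark.sql._
import org.apache.spark.sql.functions.lit
import java.text.SimpleDateFormat
import java.util.{Calendar, Properties}

object MysqlToHiveGeneric {
  // Usage (hypothetical): spark-submit ... --class MysqlToHiveGeneric app.jar <table> <lowerBound> <upperBound>
  def main(args: Array[String]): Unit = {
    val Array(table, lower, upper) = args

    val spark = SparkSession.builder()
      .appName(s"ods_$table")
      .enableHiveSupport()
      .config("spark.sql.warehouse.dir", "/user/hive/warehouse")
      .getOrCreate()

    val properties = new Properties()
    properties.put("user", "root")
    properties.put("password", "123456")
    properties.put("driver", "com.mysql.jdbc.Driver")

    // Register the MySQL table under its own name as a temporary view
    spark.read.jdbc("jdbc:mysql://bigdata1:3306/ds_db01", table, properties)
      .createOrReplaceTempView(table)

    // etl_date is the previous day in yyyyMMdd format
    val dateFormat = new SimpleDateFormat("yyyyMMdd")
    val day = dateFormat.format(Calendar.getInstance().getTime.getTime - 24 * 60 * 60 * 1000)

    // Filter the extraction window and write to the matching ods table
    spark.sql(s"SELECT * FROM $table WHERE modified_time > '$lower' AND modified_time < '$upper'")
      .withColumn("etl_date", lit(day))
      .write.mode("overwrite").partitionBy("etl_date").saveAsTable(s"ods.$table")

    spark.stop()
  }
}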
Once the initial data in Hive is ready, move on to incremental extraction; see the follow-up post "Spark抽取MySQL指定数据表中的增量数据到Hive" (Extracting incremental data from specified MySQL tables into Hive with Spark) on CSDN.