Spark (Hive source data preparation): Extracting incremental data from specified MySQL tables into Hive

Write a Spark job in Scala that extracts a subset of the data in the tables customer_inf, order_detail, order_master, and product_info from the MySQL database ds_db01 into the Hive ods database.

SQL file downloads:

        123 Cloud Drive (123云盘): https://www.123684.com/s/kgrAjv-z7q3d

        Aliyun Drive (阿里云盘): https://www.alipan.com/s/A5QjDUv8PXQ

Add the following dependencies to pom.xml (the com.mysql.jdbc.Driver class used in the code below matches this 5.x connector; the 8.x connector renamed it to com.mysql.cj.jdbc.Driver):

    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.12</artifactId>
      <version>3.1.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-hive_2.12</artifactId>
      <version>3.1.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-sql_2.12</artifactId>
      <version>3.1.2</version>
    </dependency>
    <dependency>
      <groupId>mysql</groupId>
      <artifactId>mysql-connector-java</artifactId>
      <version>5.1.37</version>
    </dependency>

1. customer_inf

import org.apache.spark.sql._
import org.apache.spark.sql.functions.lit
import java.text.SimpleDateFormat
import java.util.{Calendar, Properties}
object mysqltohive {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("customer_inf")
      .enableHiveSupport()
      .config("spark.sql.warehouse.dir", "/user/hive/warehouse")
      .getOrCreate()
    // Connect to the MySQL database
    val jdbc = "jdbc:mysql://bigdata1:3306/ds_db01"
    val table = "customer_inf"
    val properties = new Properties()
    properties.put("user", "root")
    properties.put("password", "123456")
    properties.put("driver", "com.mysql.jdbc.Driver")
    // Register the MySQL table as a temporary view
    spark.read.jdbc(jdbc, table, properties).createOrReplaceTempView("customer_inf")
    // Pull only the incremental slice, bounded by modified_time
    val df = spark.sql("SELECT * FROM customer_inf WHERE modified_time > '2022-08-20 22:19:36' AND modified_time < '2022-08-21 22:34:46'")
    // etl_date partition value: the previous day, formatted as yyyyMMdd
    val dateFormat = new SimpleDateFormat("yyyyMMdd")
    val day = dateFormat.format(Calendar.getInstance().getTime.getTime - 24 * 60 * 60 * 1000)
    val frame1 = df.withColumn("etl_date", lit(day))
    frame1.write.mode("overwrite").partitionBy("etl_date").saveAsTable("ods.customer_inf")
  }
}
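
After the job runs, it's worth a quick sanity check that the partition landed. A minimal sketch, run from a spark-shell started with Hive support against the same metastore (the same check works for the other three tables below):

// Verify the etl_date partition exists and count the rows that were loaded
spark.sql("SHOW PARTITIONS ods.customer_inf").show(false)
spark.sql("SELECT etl_date, COUNT(*) AS cnt FROM ods.customer_inf GROUP BY etl_date").show()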

2. product_info

import org.apache.spark.sql._
import org.apache.spark.sql.functions.lit
import java.text.SimpleDateFormat
import java.util.{Calendar, Properties}
object mysqltohive2 {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("customer_inf")
      .enableHiveSupport()
      .config("spark.sql.warehouse.dir", "/user/hive/warehouse")
      .getOrCreate()
    // Connect to the MySQL database
    val jdbc = "jdbc:mysql://bigdata1:3306/ds_db01"
    val table = "product_info"
    val properties = new Properties()
    properties.put("user", "root")
    properties.put("password", "123456")
    properties.put("driver", "com.mysql.jdbc.Driver")
    // Register the MySQL table as a temporary view
    spark.read.jdbc(jdbc, table, properties).createOrReplaceTempView("product_info")
    // Pull only the incremental slice, bounded by modified_time
    val df = spark.sql("SELECT * FROM product_info WHERE modified_time > '2022-09-03 04:09:42' AND modified_time < '2022-09-04 00:32:00'")
    // etl_date partition value: the previous day, formatted as yyyyMMdd
    val dateFormat = new SimpleDateFormat("yyyyMMdd")
    val day = dateFormat.format(Calendar.getInstance().getTime.getTime - 24 * 60 * 60 * 1000)
    val frame1 = df.withColumn("etl_date", lit(day))
    frame1.write.mode("overwrite").partitionBy("etl_date").saveAsTable("ods.product_info")
  }
}

3. order_master

import org.apache.spark.sql._
import org.apache.spark.sql.functions.lit

import java.text.SimpleDateFormat
import java.util.{Calendar, Properties}

object mysqltohive3 {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("customer_inf")
      .enableHiveSupport()
      .config("spark.sql.warehouse.dir", "/user/hive/warehouse")
      .getOrCreate()
    // Connect to the MySQL database
    val jdbc = "jdbc:mysql://bigdata1:3306/ds_db01"
    val table = "order_master"
    val properties = new Properties()
    properties.put("user", "root")
    properties.put("password", "123456")
    properties.put("driver", "com.mysql.jdbc.Driver")
    // Register the MySQL table as a temporary view
    spark.read.jdbc(jdbc, table, properties).createOrReplaceTempView("order_master")
    // Pull only the incremental slice, bounded by modified_time
    val df = spark.sql("SELECT * FROM order_master WHERE modified_time > '2022-03-17 10:05:28' AND modified_time < '2022-03-20 15:20:45'")
    // etl_date partition value: the previous day, formatted as yyyyMMdd
    val dateFormat = new SimpleDateFormat("yyyyMMdd")
    val day = dateFormat.format(Calendar.getInstance().getTime.getTime - 24 * 60 * 60 * 1000)
    val frame1 = df.withColumn("etl_date", lit(day))
    frame1.write.mode("overwrite").partitionBy("etl_date").saveAsTable("ods.order_master")
  }
}

4. order_detail

import org.apache.spark.sql._
import org.apache.spark.sql.functions.lit

import java.text.SimpleDateFormat
import java.util.{Calendar, Properties}

object mysqltohive4 {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("customer_inf")
      .enableHiveSupport()
      .config("spark.sql.warehouse.dir", "/user/hive/warehouse")
      .getOrCreate()
    // Connect to the MySQL database
    val jdbc = "jdbc:mysql://bigdata1:3306/ds_db01"
    val table = "order_detail"
    val properties = new Properties()
    properties.put("user", "root")
    properties.put("password", "123456")
    properties.put("driver", "com.mysql.jdbc.Driver")
    // Register the MySQL table as a temporary view
    spark.read.jdbc(jdbc, table, properties).createOrReplaceTempView("order_detail")
    // Pull only the incremental slice, bounded by modified_time
    val df = spark.sql("SELECT * FROM order_detail WHERE modified_time > '2022-03-17 21:07:34' AND modified_time < '2022-03-18 23:18:34'")
    // etl_date partition value: the previous day, formatted as yyyyMMdd
    val dateFormat = new SimpleDateFormat("yyyyMMdd")
    val day = dateFormat.format(Calendar.getInstance().getTime.getTime - 24 * 60 * 60 * 1000)
    val frame1 = df.withColumn("etl_date", lit(day))
    frame1.write.mode("overwrite").partitionBy("etl_date").saveAsTable("ods.order_detail")
  }
}
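
The four jobs above differ only in the source table and the modified_time window, so they can be collapsed into a single parameterized job. Below is a minimal sketch under the same assumptions as the code above (the bigdata1/ds_db01 connection, the root/123456 credentials, and the four time windows copied verbatim), using java.time instead of SimpleDateFormat for the previous-day partition value:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.lit
import java.time.LocalDate
import java.time.format.DateTimeFormatter
import java.util.Properties

object MysqlToHiveAll {
  // (table, lower bound, upper bound) on modified_time, copied from the four jobs above
  private val tables = Seq(
    ("customer_inf", "2022-08-20 22:19:36", "2022-08-21 22:34:46"),
    ("product_info", "2022-09-03 04:09:42", "2022-09-04 00:32:00"),
    ("order_master", "2022-03-17 10:05:28", "2022-03-20 15:20:45"),
    ("order_detail", "2022-03-17 21:07:34", "2022-03-18 23:18:34")
  )

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("mysql_to_hive_ods")
      .enableHiveSupport()
      .config("spark.sql.warehouse.dir", "/user/hive/warehouse")
      .getOrCreate()

    val jdbc = "jdbc:mysql://bigdata1:3306/ds_db01"
    val properties = new Properties()
    properties.put("user", "root")
    properties.put("password", "123456")
    properties.put("driver", "com.mysql.jdbc.Driver")

    // etl_date partition value: the previous day, formatted as yyyyMMdd
    val etlDate = LocalDate.now().minusDays(1).format(DateTimeFormatter.ofPattern("yyyyMMdd"))

    // Read each incremental slice from MySQL and overwrite the matching ods table
    for ((table, from, to) <- tables) {
      spark.read.jdbc(jdbc, table, properties)
        .where(s"modified_time > '$from' AND modified_time < '$to'")
        .withColumn("etl_date", lit(etlDate))
        .write.mode("overwrite")
        .partitionBy("etl_date")
        .saveAsTable(s"ods.$table")
    }

    spark.stop()
  }
}

Packaged the same way as the jobs above, a single spark-submit of this object loads all four ods tables in one run.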

Once the Hive source data is prepared, move on to the incremental extraction, covered in the follow-up post: Spark抽取MySQL指定数据表中的增量数据到Hive (CSDN blog).
