Spark (Hive source data preparation): Extracting incremental data from specified MySQL tables into Hive

Write a Spark job in Scala that extracts a subset of the data in the tables customer_inf, order_detail, order_master, and product_info from the MySQL database ds_db01 into the Hive ods database.

SQL file downloads:

        123 Cloud Drive (123云盘): https://www.123684.com/s/kgrAjv-z7q3d

        Aliyun Drive (阿里云盘): https://www.alipan.com/s/A5QjDUv8PXQ

Add the following dependencies to pom.xml (the com.mysql.jdbc.Driver class used in the code below matches this 5.x connector; the 8.x connector renamed it to com.mysql.cj.jdbc.Driver):

    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.12</artifactId>
      <version>3.1.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-hive_2.12</artifactId>
      <version>3.1.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-sql_2.12</artifactId>
      <version>3.1.2</version>
    </dependency>
    <dependency>
      <groupId>mysql</groupId>
      <artifactId>mysql-connector-java</artifactId>
      <version>5.1.37</version>
    </dependency>

1. customer_inf

import org.apache.spark.sql._
import org.apache.spark.sql.functions.lit
import java.text.SimpleDateFormat
import java.util.{Calendar, Properties}
object mysqltohive {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("customer_inf")
      .enableHiveSupport()
      .config("spark.sql.warehouse.dir", "/user/hive/warehouse")
      .getOrCreate()
    // Connect to the MySQL database
    val jdbc = "jdbc:mysql://bigdata1:3306/ds_db01"
    val table = "customer_inf"
    val properties = new Properties()
    properties.put("user", "root")
    properties.put("password", "123456")
    properties.put("driver", "com.mysql.jdbc.Driver")
    // Register the MySQL table as a temporary view
    spark.read.jdbc(jdbc, table, properties).createOrReplaceTempView("customer_inf")
    // Pull only the incremental slice, bounded by modified_time
    val df = spark.sql("SELECT * FROM customer_inf WHERE modified_time > '2022-08-20 22:19:36' AND modified_time < '2022-08-21 22:34:46'")
    // etl_date partition value: the previous day, formatted as yyyyMMdd
    val dateFormat = new SimpleDateFormat("yyyyMMdd")
    val day = dateFormat.format(Calendar.getInstance().getTime.getTime - 24 * 60 * 60 * 1000)
    val frame1 = df.withColumn("etl_date", lit(day))
    frame1.write.mode("overwrite").partitionBy("etl_date").saveAsTable("ods.customer_inf")
  }
}
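
After the job runs, it's worth a quick sanity check that the partition landed. A minimal sketch, run from a spark-shell started with Hive support against the same metastore (the same check works for the other three tables below):

// Verify the etl_date partition exists and count the rows that were loaded
spark.sql("SHOW PARTITIONS ods.customer_inf").show(false)
spark.sql("SELECT etl_date, COUNT(*) AS cnt FROM ods.customer_inf GROUP BY etl_date").show()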

2. product_info

import org.apache.spark.sql._
import org.apache.spark.sql.functions.lit
import java.text.SimpleDateFormat
import java.util.{Calendar, Properties}
object mysqltohive2 {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("customer_inf")
      .enableHiveSupport()
      .config("spark.sql.warehouse.dir", "/user/hive/warehouse")
      .getOrCreate()
    // Connect to the MySQL database
    val jdbc = "jdbc:mysql://bigdata1:3306/ds_db01"
    val table = "product_info"
    val properties = new Properties()
    properties.put("user", "root")
    properties.put("password", "123456")
    properties.put("driver", "com.mysql.jdbc.Driver")
    // Register the MySQL table as a temporary view
    spark.read.jdbc(jdbc, table, properties).createOrReplaceTempView("product_info")
    // Pull only the incremental slice, bounded by modified_time
    val df = spark.sql("SELECT * FROM product_info WHERE modified_time > '2022-09-03 04:09:42' AND modified_time < '2022-09-04 00:32:00'")
    // etl_date partition value: the previous day, formatted as yyyyMMdd
    val dateFormat = new SimpleDateFormat("yyyyMMdd")
    val day = dateFormat.format(Calendar.getInstance().getTime.getTime - 24 * 60 * 60 * 1000)
    val frame1 = df.withColumn("etl_date", lit(day))
    frame1.write.mode("overwrite").partitionBy("etl_date").saveAsTable("ods.product_info")
  }
}

3. order_master

import org.apache.spark.sql._
import org.apache.spark.sql.functions.lit

import java.text.SimpleDateFormat
import java.util.{Calendar, Properties}

object mysqltohive3 {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("customer_inf")
      .enableHiveSupport()
      .config("spark.sql.warehouse.dir", "/user/hive/warehouse")
      .getOrCreate()
    // Connect to the MySQL database
    val jdbc = "jdbc:mysql://bigdata1:3306/ds_db01"
    val table = "order_master"
    val properties = new Properties()
    properties.put("user", "root")
    properties.put("password", "123456")
    properties.put("driver", "com.mysql.jdbc.Driver")
    // Register the MySQL table as a temporary view
    spark.read.jdbc(jdbc, table, properties).createOrReplaceTempView("order_master")
    // Pull only the incremental slice, bounded by modified_time
    val df = spark.sql("SELECT * FROM order_master WHERE modified_time > '2022-03-17 10:05:28' AND modified_time < '2022-03-20 15:20:45'")
    // etl_date partition value: the previous day, formatted as yyyyMMdd
    val dateFormat = new SimpleDateFormat("yyyyMMdd")
    val day = dateFormat.format(Calendar.getInstance().getTime.getTime - 24 * 60 * 60 * 1000)
    val frame1 = df.withColumn("etl_date", lit(day))
    frame1.write.mode("overwrite").partitionBy("etl_date").saveAsTable("ods.order_master")
  }
}

4. order_detail

import org.apache.spark.sql._
import org.apache.spark.sql.functions.lit

import java.text.SimpleDateFormat
import java.util.{Calendar, Properties}

object mysqltohive4 {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("customer_inf")
      .enableHiveSupport()
      .config("spark.sql.warehouse.dir", "/user/hive/warehouse")
      .getOrCreate()
    // Connect to the MySQL database
    val jdbc = "jdbc:mysql://bigdata1:3306/ds_db01"
    val table = "order_detail"
    val properties = new Properties()
    properties.put("user", "root")
    properties.put("password", "123456")
    properties.put("driver", "com.mysql.jdbc.Driver")
    // Register the MySQL table as a temporary view
    spark.read.jdbc(jdbc, table, properties).createOrReplaceTempView("order_detail")
    // Pull only the incremental slice, bounded by modified_time
    val df = spark.sql("SELECT * FROM order_detail WHERE modified_time > '2022-03-17 21:07:34' AND modified_time < '2022-03-18 23:18:34'")
    // etl_date partition value: the previous day, formatted as yyyyMMdd
    val dateFormat = new SimpleDateFormat("yyyyMMdd")
    val day = dateFormat.format(Calendar.getInstance().getTime.getTime - 24 * 60 * 60 * 1000)
    val frame1 = df.withColumn("etl_date", lit(day))
    frame1.write.mode("overwrite").partitionBy("etl_date").saveAsTable("ods.order_detail")
  }
}
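
The four jobs above differ only in the source table and the modified_time window, so they can be collapsed into a single parameterized job. Below is a minimal sketch under the same assumptions as the code above (the bigdata1/ds_db01 connection, the root/123456 credentials, and the four time windows copied verbatim), using java.time instead of SimpleDateFormat for the previous-day partition value:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.lit
import java.time.LocalDate
import java.time.format.DateTimeFormatter
import java.util.Properties

object MysqlToHiveAll {
  // (table, lower bound, upper bound) on modified_time, copied from the four jobs above
  private val tables = Seq(
    ("customer_inf", "2022-08-20 22:19:36", "2022-08-21 22:34:46"),
    ("product_info", "2022-09-03 04:09:42", "2022-09-04 00:32:00"),
    ("order_master", "2022-03-17 10:05:28", "2022-03-20 15:20:45"),
    ("order_detail", "2022-03-17 21:07:34", "2022-03-18 23:18:34")
  )

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("mysql_to_hive_ods")
      .enableHiveSupport()
      .config("spark.sql.warehouse.dir", "/user/hive/warehouse")
      .getOrCreate()

    val jdbc = "jdbc:mysql://bigdata1:3306/ds_db01"
    val properties = new Properties()
    properties.put("user", "root")
    properties.put("password", "123456")
    properties.put("driver", "com.mysql.jdbc.Driver")

    // etl_date partition value: the previous day, formatted as yyyyMMdd
    val etlDate = LocalDate.now().minusDays(1).format(DateTimeFormatter.ofPattern("yyyyMMdd"))

    // Read each incremental slice from MySQL and overwrite the matching ods table
    for ((table, from, to) <- tables) {
      spark.read.jdbc(jdbc, table, properties)
        .where(s"modified_time > '$from' AND modified_time < '$to'")
        .withColumn("etl_date", lit(etlDate))
        .write.mode("overwrite")
        .partitionBy("etl_date")
        .saveAsTable(s"ods.$table")
    }

    spark.stop()
  }
}

Packaged the same way as the jobs above, a single spark-submit of this object loads all four ods tables in one run.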

Once the Hive source data is prepared, move on to the incremental extraction, covered in the follow-up post: Spark抽取MySQL指定数据表中的增量数据到Hive (CSDN blog).
