DataX JSON Templates for Different ETL Tasks

This article shows DataX job configurations for migrating data between different data sources, including MySQL to Kudu, MySQL to HDFS, HDFS to ClickHouse, and several other scenarios.


Project scenario:

DataX sync job JSON templates:


mysql -> kudu

Kudu table DDL

DROP TABLE dim.content_test;
CREATE TABLE dim.content_test(
id string NOT NULL,
title string NOT NULL,
PRIMARY KEY (id)
)
PARTITION BY HASH (id) PARTITIONS 3
STORED AS KUDU TBLPROPERTIES ('kudu.master_addresses'='xxx:7051,xxx:7051,xxx:7051');
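
The MySQL side is assumed to be a table with the same two columns. A minimal sketch of such a source table (the column types and lengths are assumptions, not taken from the original):

-- Hypothetical MySQL source table; adjust types and lengths to the real schema
CREATE TABLE content_test (
  ID    VARCHAR(64)  NOT NULL COMMENT 'content id',
  TITLE VARCHAR(255) NOT NULL COMMENT 'content name',
  PRIMARY KEY (ID)
);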

json

{
  "core": {
    "transport": {
      "channel": {
        "speed": {
          "byte": 104857600
        }
      }
    }
  },
  "job": {
    "setting": {
      "speed": {
        "channel": 10,
        "byte": 1048576000
      },
      "errorLimit": {
        "record": 0,
        "percentage": 0.02
      }
    },
    "content": [
      {
        "reader": {
          "name": "mysqlreader",
          "parameter": {
            "username": "xxx",
            "password": "xxx",
            "splitPk": "",
            "connection": [
              {
                "querySql": [
                  "select ID,TITLE from content_test;"
                ],
                "jdbcUrl": [
                  "jdbc:mysql://xxx:3306/yy"
                ]
              }
            ]
          }
        },
        "writer": {
          "name": "kuduwriter",
          "parameter": {
            "batchSize": 1024,
            "bufferSize": 2048,
            "skipFail": false,
            "encoding": "UTF-8",
            "kuduConfig": {
              "kudu.master_addresses": "xxx:7051,xxx:7051,xxx:7051"
            },
            "table": "impala::dim.content_test",
            "truncate": false,
            "writeMode": "upsert",
            "column": [
              {
                "index": 0,
                "name": "id",
                "type": "int",
                "comment": "内容id",
                "primaryKey": true
              },
              {
                "index": 1,
                "name": "title",
                "type": "string",
                "comment": "内容名称"
              }
            ],
            "primaryKey": [
              {
                "index": 0,
                "name": "id",
                "type": "int"
              }
            ]
          }
        }
      }
    ]
  }
}

Reference: https://github.com/alibaba/DataX/blob/master/kuduwriter/doc/kuduwirter.md

mysql -> hdfs

json

{
  "core": {
    "transport": {
      "channel": {
        "speed": {
          "byte": 104857600
        }
      }
    }
  },
  "job": {
    "setting": {
      "speed": {
        "channel": 10,
        "byte": 1048576000
      },
      "errorLimit": {
        "record": 0,
        "percentage": 0.02
      }
    },
    "content": [
      {
        "reader": {
          "name": "mysqlreader",
          "parameter": {
            "username": "xxx",
            "password": "xxx",
            "column": [
              "`id`",
              "`ww`"
            ],
            "splitPk": "",
            "connection": [
              {
                "table": [
                  "comp_tx"
                ],
                "jdbcUrl": [
                  "jdbc:mysql://XXX:3312/ddd"
                ]
              }
            ]
          }
        },
        "writer": {
          "name": "hdfswriter",
          "parameter": {
            "defaultFS": "hdfs://name1",
            "nullFormat": "\\N",
            "hadoopConfig": {
              "dfs.nameservices": "nameservice1",
              "dfs.ha.namenodes.nameservice1": "aa,bb",
              "dfs.namenode.rpc-address.nameservice1.aa": "hdfs://XXX:8020",
              "dfs.namenode.rpc-address.nameservice1.bb": "hdfs://XXX:8020",
              "dfs.client.failover.proxy.provider.nameservice1": "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider",
              "dfs.ha.automatic-failover.enabled.yournamespace": "true"
            },
            "fileType": "orc",
            "path": "/user/hive/dim/XXX",
            "fileName": "XXX",
            "writeMode": "append",
            "fieldDelimiter": "\u0001",
            "column": [
              {
                "name": "id",
                "type": "string"
              },
              {
                "name": "ww",
                "type": "string"
              }
            ]
          }
        }
      }
    ]
  }
}

Reference: https://github.com/alibaba/DataX/blob/master/hdfswriter/doc/hdfswriter.md
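
hdfswriter only writes data files into an existing HDFS path; it does not create a Hive table on top of them. A minimal sketch of a matching Hive ORC table for this job (the database name, table name, and the reuse of the placeholder location are assumptions):

-- Hypothetical Hive table over the hdfswriter output path above
CREATE EXTERNAL TABLE dim.comp_tx (
  id string,
  ww string
)
STORED AS ORC
LOCATION '/user/hive/dim/XXX';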

hdfs -> clickhouse

json

{
  "core": {
    "transport": {
      "channel": {
        "speed": {
          "byte": 104857600
        }
      }
    }
  },
  "job": {
    "setting": {
      "speed": {
        "channel": 10,
        "byte": 1048576000
      },
      "errorLimit": {
        "record": 0,
        "percentage": 0.02
      }
    },
    "content": [
      {
        "reader": {
          "name": "hdfsreader",
          "parameter": {
            "path": "/user/hive/test01",
            "nullFormat": "\\N",
            "defaultFS": "hdfs://name1",
            "hadoopConfig": {
              "dfs.nameservices": "xx",
              "dfs.ha.namenodes.nameservice1": "xx01,xx02",
              "dfs.namenode.rpc-address.nameservice1.xx01": "hdfs://xxx:8020",
              "dfs.namenode.rpc-address.nameservice1.xx02": "hdfs://xxx:8020",
              "dfs.client.failover.proxy.provider.nameservice1": "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider",
              "dfs.ha.automatic-failover.enabled.yournamespace": "true"
            },
            "fileType": "orc",
            "fieldDelimiter": "\u0001",
            "skipHeader": true,
            "column": [
              {
                "index": "0",
                "type": "string"
              },
              {
                "index": "1",
                "type": "string"
              },
              {
                "index": "2",
                "type": "string"
              },
              {
                "index": "3",
                "type": "string"
              }
            ]
          }
        },
        "writer": {
          "name": "clickhousewriter",
          "parameter": {
            "username": "xxxx",
            "password": "xxxx",
            "column": [
              "event",
              "aa",
              "bb",
              "dt"
            ],
            "connection": [
              {
                "table": [
                  "ww"
                ],
                "jdbcUrl": "jdbc:clickhouse://xxx:8123/test"
              }
            ]
          }
        }
      }
    ]
  }
}
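
clickhousewriter expects the target table to exist before the job runs. A minimal sketch of a matching ClickHouse table (the String column types, the MergeTree engine, and the ordering key are assumptions):

-- Hypothetical ClickHouse target table for the four columns written above
CREATE TABLE test.ww (
  event String,
  aa    String,
  bb    String,
  dt    String
)
ENGINE = MergeTree
ORDER BY (dt, event);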

hdfs -> local text file

json

{
  "core": {
    "transport": {
      "channel": {
        "speed": {
          "byte": 104857600
        }
      }
    }
  },
  "job": {
    "setting": {
      "speed": {
        "channel": 10,
        "byte": 1048576000
      },
      "errorLimit": {
        "record": 0,
        "percentage": 0.02
      }
    },
    "content": [
      {
        "reader": {
          "name": "hdfsreader",
          "parameter": {
            "path": "/user/hive/warehouse/*.0.",
            "defaultFS": "hdfs://NameServiceHA",
            "hadoopConfig": {
              "dfs.nameservices": "NameServiceHA",
              "dfs.ha.namenodes.NameServiceHA": "data01,data02",
              "dfs.namenode.rpc-address.NameServiceHA.data01": "hdfs://XXX:8020",
              "dfs.namenode.rpc-address.NameServiceHA.data02": "hdfs://XXX:8020",
              "dfs.client.failover.proxy.provider.NameServiceHA": "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider",
              "dfs.ha.automatic-failover.enabled.yournamespace": "true"
            },
            "column": [
              "*"
            ],
            "fileType": "TEXT",
            "fieldDelimiter": "\u0001"
          }
        },
        "writer": {
          "name": "txtfilewriter",
          "parameter": {
            "path": "/data/team/",
            "fileName": "test",
            "writeMode": "truncate",
            "dateFormat": "yyyy-MM-dd"
          }
        }
      }
    ]
  }
}

hdfs -> ftp

json

  "core": {
    "transport": {
      "channel": {
        "speed": {
          "byte": 104857600
        }
      }
    }
  },
  "job": {
    "setting": {
      "speed": {
        "channel": 10,
        "byte": 1048576000
      },
      "errorLimit": {
        "record": 0,
        "percentage": 0.02
      }
    },
    "content": [
      {
        "reader": {
          "name": "hdfsreader",
          "parameter": {
            "path": "/user/hive/warehouse/*.0.",
            "defaultFS": "hdfs://NameServiceHA",
            "hadoopConfig": {
              "dfs.nameservices": "NameServiceHA",
              "dfs.ha.namenodes.NameServiceHA": "data01,data02",
              "dfs.namenode.rpc-address.NameServiceHA.data01": "hdfs://xxx:8020",
              "dfs.namenode.rpc-address.NameServiceHA.data02": "hdfs://xxx:8020",
              "dfs.client.failover.proxy.provider.NameServiceHA": "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider",
              "dfs.ha.automatic-failover.enabled.yournamespace": "true"
            },
            "column": [
              "*"
            ],
            "fileType": "TEXT",
            "fieldDelimiter": "\u0001"
          }
        },
        "writer": {
          "name": "ftpwriter",
          "parameter": {
            "protocol": "ftp",
            "host": "xxx",
            "port": 21,
            "username": "xxx",
            "password": "xxx",
            "connectPattern": "PASV",
            "path": "/user/20220705/",
            "fileName": "test",
            "writeMode": "truncate",
            "fieldDelimiter": ",",
            "timeout": "3600000",
            "encoding": "UTF-8",
            "nullFormat": "\\N",
            "dateFormat": "yyyy-MM-dd",
            "fileFormat": "text",
            "suffix": ".text",
            "header": []
          }
        }
      }
    ]
  }
}

mysql -> clickhouse

json

{
  "job": {
    "setting": {
      "speed": {
        "channel": 3,
        "byte": 1048576
      },
      "errorLimit": {
        "record": 0,
        "percentage": 0.02
      }
    },
    "content": [
      {
        "reader": {
          "name": "mysqlreader",
          "parameter": {
            "username": "xxx",
            "password": "xxx",
            "splitPk": "",
            "connection": [
              {
                "querySql": [
                  "select id,code FROM test.aa"
                ],
                "jdbcUrl": [
                  "jdbc:mysql://xxx:3306/test"
                ]
              }
            ]
          }
        },
        "writer": {
          "name": "clickhousewriter",
          "parameter": {
            "username": "xxx",
            "password": "xxx",
            "column": [
              "id",
              "code"
            ],
            "connection": [
              {
                "table": [
                  "test01"
                ],
                "jdbcUrl": "jdbc:clickhouse://xxx:8123/test"
              }
            ]
          }
        }
      }
    ]
  }
}
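
As with the earlier ClickHouse example, the target table must already exist. A minimal sketch for test.test01 (the column types, engine, and ordering key are assumptions):

-- Hypothetical ClickHouse target table for the id/code columns above
CREATE TABLE test.test01 (
  id   Int64,
  code String
)
ENGINE = MergeTree
ORDER BY id;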

hdfs -> hdfs

json

{
  "job": {
    "setting": {
      "speed": {
        "channel": 10
      }
    },
    "content": [
      {
        "reader": {
          "name": "hdfsreader",
          "parameter": {
            "path": "/user/hive/warehouse/*.0.parq",
            "defaultFS": "hdfs://NameServiceHA",
            "hadoopConfig": {
              "dfs.nameservices": "NameServiceHA",
              "dfs.ha.namenodes.NameServiceHA": "test01,test02",
              "dfs.namenode.rpc-address.NameServiceHA.data01": "hdfs://xxx:8020",
              "dfs.namenode.rpc-address.NameServiceHA.data02": "hdfs://xxx:8020",
              "dfs.client.failover.proxy.provider.NameServiceHA": "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider",
              "dfs.ha.automatic-failover.enabled.yournamespace": "true"
            },
            "column": [
              "*"
            ],
            "fileType": "PARQUET",
            "fieldDelimiter": "\u0001"
          }
        },
        "writer": {
          "name": "hdfswriter",
          "parameter": {
            "defaultFS": "hdfs://test",
            "hadoopConfig": {
              "dfs.nameservices": "test",
              "dfs.ha.namenodes.nameservice1": "test01,test02",
              "dfs.namenode.rpc-address.nameservice1.test01": "hdfs://xxx:8020",
              "dfs.namenode.rpc-address.nameservice1.test02": "hdfs://xxx:8020",
              "dfs.client.failover.proxy.provider.nameservice1": "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider",
              "dfs.ha.automatic-failover.enabled.yournamespace": "true"
            },
            "fileType": "TEXT",
            "path": "/newtv/hive/test_temp",
            "fileName": "test_temp",
            "column": [
              {
                "name": "aa",
                "type": "string"
              },
              {
                "name": "bb",
                "type": "string"
              }
            ],
            "writeMode": "append",
            "fieldDelimiter": "^"
          }
        }
      }
    ]
  }
}
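
The hdfswriter above appends ^-delimited text files under /newtv/hive/test_temp. A minimal sketch of a Hive table over that location (the external-table choice and database are assumptions):

-- Hypothetical Hive table reading the hdfswriter output above
CREATE EXTERNAL TABLE test_temp (
  aa string,
  bb string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '^'
STORED AS TEXTFILE
LOCATION '/newtv/hive/test_temp';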
### ETL tool DataX: usage guide and configuration notes

#### Installation and environment

Before running any DataX sync job, a few basic prerequisites must be in place. A Java Development Kit (JDK) is one of the required components.

Once installed, verify the JDK with a simple command:

```bash
java -version
```

#### DataX job structure

DataX uses a plugin architecture, which means different readers and writers can be loaded as needed for each source and target. A complete DataX job configuration file usually contains the following parts:

- **job**: describes the overall properties of the task.
- **setting**: concurrency (channel count), error-record limits, and similar parameters.
- **content**: a list of concrete data-transfer sub-tasks (taskGroup), each of which is further divided into reader and writer settings.

A basic JSON configuration skeleton looks like this:

```json
{
  "job": {
    "content": [
      {
        "reader": {
          "name": "mysqlreader",
          "parameter": { ... }
        },
        "writer": {
          "name": "streamwriter",
          "parameter": { ... }
        }
      }
    ],
    "setting": {}
  }
}
```

This skeleton shows a simple migration with a MySQL database as the source and the console output stream (streamwriter) as the target.

#### Stability features

Given the complexity and unpredictability of real deployments, DataX puts particular emphasis on robustness. For failures such as network jitter or temporarily unreachable resources, it provides multi-level automatic recovery, including in-thread retries and task restarts across threads. These mechanisms help keep data synchronization continuous and reliable even when unexpected problems occur.

#### Logging and monitoring

In addition to fault tolerance, DataX provides detailed logging. The built-in log output tracks the state changes of each step and reports key metrics such as throughput and memory usage, which is useful for later analysis and tuning.