Juniversalchardet - Java 实现自动编码检测与转换全解

小巫程序Demo日记

于 2025-03-14 16:49:49 发布

阅读量1k

点赞数 16

CC 4.0 BY-SA版权

分类专栏： Java基础、Java、Spark+Hadoop学习 | 文章标签： java、开发语言

本文链接：https://blog.csdn.net/m0_73889530/article/details/146261369

Java 同时被 3 个专栏收录

24 篇文章

订阅专栏

Java基础

23 篇文章

订阅专栏

Spark+Hadoop学习

23 篇文章

订阅专栏

好的！下面我将为您提供一个完整的 Java Demo，展示如何使用 Juniversalchardet 库自动检测 .xlsx、.gbk 和 .txt 文件的编码，并针对 .xlsx 文件进行特殊处理。这个 Demo 可以直接运行，只需准备好您的文件并调整文件路径即可。

项目准备

环境要求

Java 8 或更高版本
Maven（用于管理依赖）

添加依赖

创建一个 Maven 项目，在 pom.xml 中添加以下依赖：

<!-- Maven build descriptor for the encoding-detection demo. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.example</groupId>
    <artifactId>encoding-detection-demo</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <!-- Compile sources as UTF-8 regardless of the platform default charset -->
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <dependencies>
        <!-- Juniversalchardet for encoding detection -->
        <dependency>
            <groupId>com.github.albfernandez</groupId>
            <artifactId>juniversalchardet</artifactId>
            <version>2.4.0</version>
        </dependency>
        <!-- Apache POI for Excel file handling -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml</artifactId>
            <version>5.2.3</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <!-- Target Java 8 for both source and bytecode -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.8.1</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>

运行 mvn install 确保依赖下载成功。

以下是针对您查询的完整回答，我会根据您的要求，把代码的注释写全，知识点梳理清楚，并且用大白话解释技术要点和逻辑思想，确保您能轻松理解代码的每个部分。

完整代码与详细中文注释

以下是三个类的完整代码，包含详细的中文注释，涵盖技术要点和逻辑思想。

1. EncodingDetector 类

import org.mozilla.universalchardet.UniversalDetector;
import java.io.FileInputStream;
import java.io.IOException;

/**
 * Detects the character encoding of a file with the juniversalchardet
 * {@code UniversalDetector}.
 *
 * <p>How it works:
 * <ul>
 *   <li>Paths ending in {@code .xlsx} (any letter case) are not inspected at all; a fixed
 *       explanatory message is returned instead of a charset name.</li>
 *   <li>Any other file is read in fixed-size chunks that are fed to the detector until the
 *       file is exhausted or the detector reports it is done.</li>
 *   <li>If the detector yields no charset, {@code "UTF-8"} is returned as the fallback.</li>
 * </ul>
 */
public class EncodingDetector {
    /** Extension that marks a file as an Excel workbook, which is skipped by detection. */
    private static final String XLSX_EXTENSION = ".xlsx";

    /** Charset name reported when the detector cannot decide. */
    private static final String DEFAULT_ENCODING = "UTF-8";

    /** Size in bytes of each chunk handed to the detector. */
    private static final int BUFFER_SIZE = 4096;

    /**
     * Detects the encoding of the file at {@code filePath}.
     *
     * @param filePath path of the file to inspect; must not be {@code null}
     * @return the detected charset name, {@code "UTF-8"} when nothing was detected, or a
     *         descriptive message (not a charset name) when the path ends in {@code .xlsx}
     * @throws IOException if the file cannot be opened or read
     */
    public static String detectEncoding(String filePath) throws IOException {
        // Excel workbooks are skipped: report that instead of running the detector on them.
        if (isXlsx(filePath)) {
            return "Excel file (binary format, encoding detection not applicable)";
        }

        byte[] buf = new byte[BUFFER_SIZE];
        try (FileInputStream fis = new FileInputStream(filePath)) {
            UniversalDetector detector = new UniversalDetector(null);
            int nread;
            // Check isDone() first so no further chunk is read once the detector has decided.
            while (!detector.isDone() && (nread = fis.read(buf)) > 0) {
                detector.handleData(buf, 0, nread);
            }
            detector.dataEnd(); // signal end of input before asking for the result
            String encoding = detector.getDetectedCharset();
            return encoding != null ? encoding : DEFAULT_ENCODING;
        }
    }

    /** Returns whether {@code filePath} ends with {@code .xlsx}, ignoring letter case. */
    private static boolean isXlsx(String filePath) {
        int extLength = XLSX_EXTENSION.length();
        // A negative offset (path shorter than the extension) makes regionMatches return false.
        return filePath.regionMatches(true, filePath.length() - extLength,
                XLSX_EXTENSION, 0, extLength);
    }
}

2. ExcelReader 类

import org.apache.poi.ss.usermodel.*;
import java.io.FileInputStream;
import java.io.IOException;

/**
 * Prints the contents of an Excel workbook's first sheet to standard output.
 *
 * <p>The workbook is opened through {@code WorkbookFactory}; every row of the first sheet
 * is written on its own line, with each cell's {@code toString()} text followed by a tab.
 */
public class ExcelReader {
    /**
     * Reads the workbook at {@code filePath} and prints its first sheet.
     *
     * @param filePath path of the workbook to read
     * @throws IOException if the file cannot be opened or read
     */
    public static void readExcel(String filePath) throws IOException {
        // Both the stream and the workbook are closed automatically, in reverse order.
        try (FileInputStream input = new FileInputStream(filePath);
             Workbook book = WorkbookFactory.create(input)) {
            Sheet firstSheet = book.getSheetAt(0); // sheet indexes are zero-based
            System.out.println("Content of " + filePath + ":");
            for (Row currentRow : firstSheet) {
                printRow(currentRow);
            }
        }
    }

    /** Prints every cell of {@code row} followed by a tab, then ends the line. */
    private static void printRow(Row row) {
        row.forEach(cell -> System.out.print(cell.toString() + "\t"));
        System.out.println();
    }
}

3. Main 类

import java.io.IOException;

/**
 * Program entry point: prints the detected encoding of each input file and, for
 * {@code .xlsx} workbooks, also prints the workbook contents.
 *
 * <p>Flow:
 * <ul>
 *   <li>Files to process come from the command-line arguments; when none are given, a
 *       built-in list of sample paths is used.</li>
 *   <li>Each file is passed to {@code EncodingDetector.detectEncoding} and the result is
 *       printed.</li>
 *   <li>Paths ending in {@code .xlsx} (any letter case) are additionally passed to
 *       {@code ExcelReader.readExcel}.</li>
 *   <li>A failure on one file is reported on standard error and does not stop the
 *       remaining files from being processed.</li>
 * </ul>
 */
public class Main {
    /** Sample paths used when no command-line arguments are supplied (replace as needed). */
    private static final String[] DEFAULT_FILE_PATHS = {
        "D:/test/sample.xlsx",
        "D:/test/sample.gbk",
        "D:/test/sample.txt"
    };

    /** Extension that marks a file as an Excel workbook. */
    private static final String XLSX_EXTENSION = ".xlsx";

    /**
     * Processes every requested file in order.
     *
     * @param args file paths to process; when {@code null} or empty the sample paths are used
     */
    public static void main(String[] args) {
        String[] filePaths = (args != null && args.length > 0) ? args : DEFAULT_FILE_PATHS;

        for (String filePath : filePaths) {
            try {
                String encoding = EncodingDetector.detectEncoding(filePath);
                System.out.println("File: " + filePath + " | Encoding: " + encoding);

                if (isXlsx(filePath)) {
                    ExcelReader.readExcel(filePath);
                }
            } catch (IOException | RuntimeException e) {
                // Unchecked exceptions are caught as well so that one malformed file
                // (for example an invalid workbook) cannot abort the whole batch.
                System.err.println("Error processing file: " + filePath);
                e.printStackTrace();
            }
            System.out.println("------------------------"); // visual separator between files
        }
    }

    /** Returns whether {@code filePath} ends with {@code .xlsx}, ignoring letter case. */
    private static boolean isXlsx(String filePath) {
        int extLength = XLSX_EXTENSION.length();
        // A negative offset (path shorter than the extension) makes regionMatches return false.
        return filePath.regionMatches(true, filePath.length() - extLength,
                XLSX_EXTENSION, 0, extLength);
    }
}