Building a Smart File Backup and Sync Tool in Python

import os
import hashlib
import shutil
import threading
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
import logging
from typing import Dict, Optional

class SmartBackup:
    """Smart file backup and sync tool."""
    
    def __init__(self, source_dir: str, backup_dir: str, 
                 max_workers: int = 4, 
                 hash_algorithm: str = 'sha256'):
        """
        Args:
            source_dir: directory to back up
            backup_dir: directory to back up into
            max_workers: maximum number of worker threads
            hash_algorithm: hash algorithm (md5/sha1/sha256)
        """
        # expanduser() makes "~/..." paths work; resolve() normalizes them
        self.source_dir = Path(source_dir).expanduser().resolve()
        self.backup_dir = Path(backup_dir).expanduser().resolve()
        self.max_workers = max_workers
        self.hash_algorithm = hash_algorithm
        self.logger = self._setup_logger()
        # stats are mutated from worker threads, so guard them with a lock
        self._stats_lock = threading.Lock()
        self.stats = {
            'total_files': 0,
            'copied': 0,
            'updated': 0,
            'skipped': 0,
            'errors': 0
        }
    
    def _setup_logger(self) -> logging.Logger:
        """Configure the logger."""
        logger = logging.getLogger('SmartBackup')
        logger.setLevel(logging.INFO)
        # avoid stacking duplicate handlers when several instances are created
        if not logger.handlers:
            formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
            handler = logging.StreamHandler()
            handler.setFormatter(formatter)
            logger.addHandler(handler)
        return logger
    
    def _get_file_hash(self, file_path: Path) -> str:
        """Compute the file's hash digest."""
        hash_obj = hashlib.new(self.hash_algorithm)
        with open(file_path, 'rb') as f:
            # read in chunks so large files are never loaded whole into memory
            for chunk in iter(lambda: f.read(4096), b""):
                hash_obj.update(chunk)
        return hash_obj.hexdigest()
    
    def _get_backup_path(self, file_path: Path) -> Path:
        """Map a source file to its path inside the backup directory."""
        relative_path = file_path.relative_to(self.source_dir)
        return self.backup_dir / relative_path
    
    def _should_update(self, source_path: Path, backup_path: Path) -> bool:
        """Decide whether the backup copy needs updating."""
        if not backup_path.exists():
            return True
        
        # cheap checks first: compare file size and modification time
        source_stat = source_path.stat()
        backup_stat = backup_path.stat()
        
        if source_stat.st_size != backup_stat.st_size:
            return True
        
        if source_stat.st_mtime > backup_stat.st_mtime:
            return True
        
        # sizes match and the source is not newer; fall back to hashing both files
        return self._get_file_hash(source_path) != self._get_file_hash(backup_path)
    
    def _sync_file(self, file_path: Path, dry_run: bool = False) -> Optional[Dict]:
        """Sync a single file; in dry-run mode, only report the planned action."""
        if not file_path.is_file():
            return None
            
        with self._stats_lock:
            self.stats['total_files'] += 1
        backup_path = self._get_backup_path(file_path)
        
        try:
            if self._should_update(file_path, backup_path):
                # 'copied' for brand-new files, 'updated' for changed ones
                action = 'copied' if not backup_path.exists() else 'updated'
                if not dry_run:
                    backup_path.parent.mkdir(parents=True, exist_ok=True)
                    shutil.copy2(file_path, backup_path)
                with self._stats_lock:
                    self.stats[action] += 1
                return {
                    'source': str(file_path),
                    'backup': str(backup_path),
                    'action': action
                }
            else:
                with self._stats_lock:
                    self.stats['skipped'] += 1
                return {
                    'source': str(file_path),
                    'backup': str(backup_path),
                    'action': 'skipped'
                }
        
        except Exception as e:
            with self._stats_lock:
                self.stats['errors'] += 1
            self.logger.error(f"Error syncing {file_path}: {e}")
            return None
    
    def run(self, dry_run: bool = False) -> Dict:
        """Run one backup/sync pass."""
        if dry_run:
            self.logger.info("Dry run: no files will actually be copied")
        
        # reset stats so back-to-back runs (e.g. a dry run, then the real run) don't accumulate
        self.stats = {key: 0 for key in self.stats}
        
        file_paths = []
        for root, _, files in os.walk(self.source_dir):
            for file in files:
                file_paths.append(Path(root) / file)
        
        results = []
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            futures = [executor.submit(self._sync_file, path, dry_run) for path in file_paths]
            for future in futures:
                result = future.result()
                if result:
                    results.append(result)
                    if result['action'] in ('copied', 'updated'):
                        verb = "Would back up" if dry_run else "Backed up"
                        self.logger.info(f"{verb}: {result['source']} -> {result['backup']}")
        
        self.logger.info(f"Backup finished. Stats: {self.stats}")
        return {
            'stats': self.stats,
            'results': results
        }

# Example usage
if __name__ == "__main__":
    # create the backup tool instance
    backup = SmartBackup(
        source_dir="~/Documents",          # directory to back up
        backup_dir="~/Backups/Documents",  # backup destination
        max_workers=8,
        hash_algorithm='sha256'
    )
    
    # do a dry run first to preview the changes
    dry_run_results = backup.run(dry_run=True)
    
    # then run the real backup once the preview looks right
    if input("Proceed with the backup? (y/n): ").lower() == 'y':
        backup.run(dry_run=False)

Usage Notes

  1. Features:

    • Smart incremental backup (only changed files are copied; see the sketch below)
    • Multi-threaded handling of large file sets
    • Several hash algorithms for content verification
    • Dry-run mode to preview the planned operations
    • Detailed statistics and logging
    • Preserves the directory structure
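    A standalone sketch of that change check (file paths here are hypothetical), showing the cheap-to-expensive cascade the class uses: size first, then mtime, and hashing only as a last resort:

    import hashlib
    from pathlib import Path

    def files_differ(a: Path, b: Path, algo: str = 'sha256') -> bool:
        sa, sb = a.stat(), b.stat()
        if sa.st_size != sb.st_size or sa.st_mtime > sb.st_mtime:
            return True  # the cheap checks already prove a difference
        def digest(p: Path) -> str:
            h = hashlib.new(algo)
            with open(p, 'rb') as f:
                for chunk in iter(lambda: f.read(65536), b""):
                    h.update(chunk)
            return h.hexdigest()
        return digest(a) != digest(b)  # expensive fallback

    # hypothetical paths, for illustration only
    print(files_differ(Path('new.txt'), Path('old.txt')))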
  2. Core parameters:

    • source_dir: source directory to back up
    • backup_dir: backup destination directory
    • max_workers: maximum number of worker threads
    • hash_algorithm: hash algorithm (md5/sha1/sha256; see the check below)
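    hashlib actually accepts more algorithms than the three listed. A small optional guard (not part of the class) can validate the choice up front, since hashlib.new() raises ValueError for unknown names:

    import hashlib

    algo = 'sha256'  # any name in hashlib.algorithms_available works
    if algo not in hashlib.algorithms_available:
        raise ValueError(f"unsupported hash algorithm: {algo}")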
  3. Usage:

    # 1. create the backup tool instance
    backup = SmartBackup(
        source_dir="/path/to/source",
        backup_dir="/path/to/backup",
        max_workers=4
    )
    
    # 2. dry run first to preview the operations
    backup.run(dry_run=True)
    
    # 3. run the real backup once confirmed
    backup.run(dry_run=False)
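    The dictionary returned by run() can also be inspected programmatically, which is handy for previewing a dry run before committing:

    # preview the dry-run report before doing the real copy
    report = backup.run(dry_run=True)
    planned = [r for r in report['results'] if r['action'] != 'skipped']
    for item in planned:
        print(f"would back up: {item['source']} -> {item['backup']}")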
  4. Typical use cases:

    • Regular backups of important documents
    • Keeping project code in sync
    • Photo/video library backups
    • Server file synchronization
    • Automated data protection (see the scheduling sketch below)
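    For the periodic-backup scenarios, a cron job or Task Scheduler entry is the usual choice; a minimal in-process sketch (reusing the backup instance from above, with an interval chosen purely as an assumption) looks like:

    import time

    INTERVAL_SECONDS = 6 * 60 * 60  # hypothetical: run every six hours

    while True:
        backup.run(dry_run=False)
        time.sleep(INTERVAL_SECONDS)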
  5. Notes:

    • Do a dry run first the first time you use it
    • Make sure the backup directory has enough free space (see the sketch below)
    • Large file sets can take a while to process
    • Performance can be tuned by adjusting max_workers
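    A rough pre-flight sketch for the free-space note, using shutil.disk_usage (both paths are placeholders):

    import shutil
    from pathlib import Path

    source = Path("/path/to/source")  # placeholder path
    needed = sum(f.stat().st_size for f in source.rglob('*') if f.is_file())
    free = shutil.disk_usage("/path/to/backup").free
    if free < needed:
        raise RuntimeError(f"need {needed} bytes, only {free} free")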