import os
import hashlib
import shutil
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
import logging
import threading
from typing import Dict, Optional
class SmartBackup:
"""智能文件备份与同步工具"""
def __init__(self, source_dir: str, backup_dir: str,
max_workers: int = 4,
hash_algorithm: str = 'sha256'):
"""
参数:
source_dir: 源目录
backup_dir: 备份目录
max_workers: 最大并发数
hash_algorithm: 哈希算法(md5/sha1/sha256)
"""
        # expanduser() lets callers pass paths such as "~/Documents".
        self.source_dir = Path(source_dir).expanduser().resolve()
        self.backup_dir = Path(backup_dir).expanduser().resolve()
self.max_workers = max_workers
self.hash_algorithm = hash_algorithm
        self.logger = self._setup_logger()
        # Lock so worker threads can update the shared stats dict safely.
        self._stats_lock = threading.Lock()
self.stats = {
'total_files': 0,
'copied': 0,
'updated': 0,
'skipped': 0,
'errors': 0
}
def _setup_logger(self) -> logging.Logger:
"""配置日志记录器"""
logger = logging.getLogger('SmartBackup')
logger.setLevel(logging.INFO)
        # Guard against duplicate handlers when several instances are created.
        if not logger.handlers:
            formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
            handler = logging.StreamHandler()
            handler.setFormatter(formatter)
            logger.addHandler(handler)
return logger
def _get_file_hash(self, file_path: Path) -> str:
"""计算文件哈希值"""
hash_obj = hashlib.new(self.hash_algorithm)
with open(file_path, 'rb') as f:
for chunk in iter(lambda: f.read(4096), b""):
hash_obj.update(chunk)
return hash_obj.hexdigest()
def _get_backup_path(self, file_path: Path) -> Path:
"""获取备份文件路径"""
relative_path = file_path.relative_to(self.source_dir)
return self.backup_dir / relative_path
def _should_update(self, source_path: Path, backup_path: Path) -> bool:
"""判断是否需要更新备份"""
if not backup_path.exists():
return True
        # Cheap checks first: compare file size and modification time.
source_stat = source_path.stat()
backup_stat = backup_path.stat()
if source_stat.st_size != backup_stat.st_size:
return True
if source_stat.st_mtime > backup_stat.st_mtime:
return True
        # Same size and mtime: fall back to comparing content hashes.
return self._get_file_hash(source_path) != self._get_file_hash(backup_path)
    def _sync_file(self, file_path: Path, dry_run: bool = False) -> Optional[Dict]:
        """Sync a single file into the backup tree."""
        if not file_path.is_file():
            return None
        with self._stats_lock:
            self.stats['total_files'] += 1
        backup_path = self._get_backup_path(file_path)
        try:
            # A file with no backup yet counts as 'copied'; a stale one as 'updated'.
            action = 'copied' if not backup_path.exists() else 'updated'
            if self._should_update(file_path, backup_path):
                if not dry_run:
                    # Create the target directory, then copy with metadata preserved.
                    backup_path.parent.mkdir(parents=True, exist_ok=True)
                    shutil.copy2(file_path, backup_path)
                with self._stats_lock:
                    self.stats[action] += 1
            else:
                action = 'skipped'
                with self._stats_lock:
                    self.stats['skipped'] += 1
            return {
                'source': str(file_path),
                'backup': str(backup_path),
                'action': action
            }
        except Exception as e:
            with self._stats_lock:
                self.stats['errors'] += 1
            self.logger.error(f"Error syncing {file_path}: {e}")
            return None
    def run(self, dry_run: bool = False) -> Dict:
        """Run the backup/sync pass."""
        if dry_run:
            self.logger.info("Dry run: no files will actually be copied")
        # Collect every file under the source tree.
        file_paths = []
        for root, _, files in os.walk(self.source_dir):
            for file in files:
                file_paths.append(Path(root) / file)
        results = []
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            # Each worker gets the dry_run flag so a preview never copies anything.
            futures = [executor.submit(self._sync_file, path, dry_run) for path in file_paths]
            for future in futures:
                result = future.result()
                if result:
                    results.append(result)
                    if not dry_run and result['action'] in ('copied', 'updated'):
                        self.logger.info(f"Backed up: {result['source']} -> {result['backup']}")
        self.logger.info(f"Backup finished. Stats: {self.stats}")
return {
'stats': self.stats,
'results': results
}
# Example usage
if __name__ == "__main__":
    # Create a backup tool instance
    backup = SmartBackup(
        source_dir="~/Documents",           # directory to back up
        backup_dir="~/Backups/Documents",   # backup destination
        max_workers=8,
        hash_algorithm='sha256'
    )
    # Do a dry run first to preview the changes
    dry_run_results = backup.run(dry_run=True)
    # Run the real backup once the preview looks right
    if input("Proceed with the backup? (y/n): ").lower() == 'y':
        backup.run(dry_run=False)
Usage guide
Features:
- Smart incremental backup (only changed files are copied)
- Multithreaded handling of large numbers of files
- Content verification with a choice of hash algorithms (see the availability check sketched after this list)
- Dry-run mode (preview the backup without touching anything)
- Detailed statistics and logging
- The source directory structure is preserved in the backup
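Availability of a given hash algorithm depends on the Python build and the underlying OpenSSL, so it can be worth checking the name before constructing the tool. A minimal sketch (the algo variable is illustrative, not part of SmartBackup):

import hashlib

algo = "sha256"
# hashlib.new() raises ValueError for unknown names; checking up front gives a clearer error.
if algo not in hashlib.algorithms_available:
    raise ValueError(f"Hash algorithm not available on this system: {algo}")
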
Core parameters:
- source_dir: source directory to back up
- backup_dir: backup destination directory
- max_workers: maximum number of worker threads
- hash_algorithm: hash algorithm (md5/sha1/sha256)
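For very large, low-risk data sets, a faster algorithm such as md5 trades collision resistance for speed. An illustrative configuration (the paths are placeholders):

media_backup = SmartBackup(
    source_dir="/data/photos",
    backup_dir="/mnt/backup/photos",
    max_workers=8,
    hash_algorithm='md5'   # faster than sha256; adequate for change detection
)
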
Basic usage:

# 1. Create a backup tool instance
backup = SmartBackup(
    source_dir="/path/to/source",
    backup_dir="/path/to/backup",
    max_workers=4
)
# 2. Do a dry run first to preview the backup
backup.run(dry_run=True)
# 3. Run the real backup after confirming
backup.run(dry_run=False)
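run() returns the statistics plus a per-file result list, so a dry-run report can be inspected before committing to the real backup:

report = backup.run(dry_run=True)
print(report['stats'])   # total_files / copied / updated / skipped / errors counters
# List the files a real run would copy or refresh.
for item in report['results']:
    if item['action'] in ('copied', 'updated'):
        print(f"{item['source']} -> {item['backup']}")
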
Use cases:
- Regular backups of important documents
- Keeping project code in sync
- Photo/video library backups
- Server file synchronization
- Automated data protection (see the scheduling sketch after this list)
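For unattended protection, the simplest option is to run the tool from a scheduler (cron, systemd timers, Task Scheduler) or a small loop. A minimal sketch, assuming the class above is saved as smart_backup.py and the paths are placeholders:

import time
from smart_backup import SmartBackup

backup = SmartBackup("/data/projects", "/mnt/backup/projects", max_workers=4)
while True:
    backup.run(dry_run=False)   # one full sync pass
    time.sleep(6 * 60 * 60)     # wait six hours before the next pass
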
Notes:
- Do a dry run first the first time you use the tool
- Make sure the backup directory has enough free space
- Backing up a very large number of files can take a while
- Tune max_workers to balance throughput and system load (a quick timing sketch follows)
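One way to choose max_workers is to time a dry run at a few settings and keep the fastest; an illustrative sketch (paths are placeholders):

import time

for workers in (2, 4, 8):
    tool = SmartBackup("/path/to/source", "/path/to/backup", max_workers=workers)
    start = time.perf_counter()
    tool.run(dry_run=True)   # a dry run scans and compares but never copies
    print(f"max_workers={workers}: {time.perf_counter() - start:.1f}s")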