运行代码:
import pandas as pd
import numpy as np
import os
import sys
import io
# ------------------------- 编码兼容设置 -------------------------
# 强制标准输出/错误流使用UTF-8编码
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')
# 设置Python环境变量,避免joblib并行计算时的编码问题
os.environ["PYTHONIOENCODING"] = "utf-8"
# ---------------------------------------------------------------
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns
# 1. 加载数据(明确指定文件路径编码)
file_path = r"C:\Users\刘涵\Desktop\数模标准\模拟题一\C题\卷烟吸阻数据.xlsx" # 原始路径(含中文)
# 读取原始数据(Sheet1)或标准化数据(Sheet2)
try:
df = pd.read_excel(file_path, sheet_name='Sheet1') # 原始数据
# df = pd.read_excel(file_path, sheet_name='Sheet2') # 标准化数据(可选)
except UnicodeDecodeError:
# 如果仍报错,尝试用二进制模式读取并指定编码(仅适用于xlsx文件)
df = pd.read_excel(file_path, engine='openpyxl', sheet_name='Sheet1')
# 显示数据基本信息
print("数据基本信息:")
print(df.info())
print("\n前5行数据:")
print(df.head())
print("\n描述性统计:")
print(df.describe())
# 2. 数据预处理
X = df.drop(columns=['吸阻(Pa)']) # 特征
y = df['吸阻(Pa)'] # 目标变量
# 数据标准化(仅对原始数据需要,若使用Sheet2的标准化数据则跳过)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# 3. 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(
X_scaled, y, test_size=0.2, random_state=42
)
# 4. 定义模型及超参数网格
models = {
'线性回归': LinearRegression(),
'岭回归': Ridge(),
'Lasso回归': Lasso(),
'随机森林': RandomForestRegressor(random_state=42),
'梯度提升': GradientBoostingRegressor(random_state=42),
'支持向量机': SVR()
}
param_grids = {
'岭回归': {'alpha': [0.1, 1, 10]},
'Lasso回归': {'alpha': [0.001, 0.01, 0.1]},
'随机森林': {
'n_estimators': [100, 200],
'max_depth': [None, 10, 20],
'min_samples_split': [2, 5]
},
'梯度提升': {
'n_estimators': [100, 200],
'learning_rate': [0.05, 0.1, 0.2],
'max_depth': [3, 5]
},
'支持向量机': {
'C': [0.1, 1, 10],
'kernel': ['linear', 'rbf', 'poly'],
'gamma': ['scale', 'auto']
}
}
# 5. 模型训练与调优(增加异常捕获)
results = {}
best_models = {}
for name, model in models.items():
print(f"\n=== 训练模型: {name} ===")
try:
if name in param_grids:
# 使用网格搜索优化超参数
grid_search = GridSearchCV(
estimator=model,
param_grid=param_grids[name],
cv=5,
scoring='neg_mean_squared_error',
n_jobs=-1 # 若仍报错,可改为n_jobs=1(关闭并行)
)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
print(f"最佳参数: {best_params}")
else:
# 直接训练基础模型
best_model = model.fit(X_train, y_train)
best_params = "无超参数可调"
# 预测与评估
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
results[name] = {
'MSE': mse,
'RMSE': rmse,
'MAE': mae,
'R²': r2,
'最佳参数': best_params
}
best_models[name] = best_model
print(f"测试集评估结果:\nMSE={mse:.2f}\nRMSE={rmse:.2f}\nMAE={mae:.2f}\nR²={r2:.4f}")
except Exception as e:
print(f"训练模型 {name} 时出错: {str(e)}")
continue
# 6. 模型性能对比
if results:
results_df = pd.DataFrame(results).T
print("\n=== 模型性能对比 ===")
print(results_df.sort_values('R²', ascending=False))
else:
print("所有模型训练失败,请检查数据和环境配置。")
# 7. 可视化分析(仅当有模型成功时执行)
if '随机森林' in best_models:
try:
# 特征重要性
feature_importances = best_models['随机森林'].feature_importances_
feature_importance_df = pd.DataFrame({
'特征': X.columns,
'重要性': feature_importances
}).sort_values('重要性', ascending=False)
plt.figure(figsize=(12, 8))
sns.barplot(x='重要性', y='特征', data=feature_importance_df.head(10))
plt.title('Top 10 重要特征')
plt.show()
except Exception as e:
print(f"绘制特征重要性图时出错: {str(e)}")
# 8. 模型保存(可选)
# from joblib import dump
# if '随机森林' in best_models:
# dump(best_models['随机森林'], 'best_cigarette_resistance_model.pkl')
# dump(scaler, 'scaler.pkl')
出现:
AttributeError Traceback (most recent call last)
Cell In[1], line 9
5 import io
7 # ------------------------- 编码兼容设置 -------------------------
8 # 强制标准输出/错误流使用UTF-8编码
----> 9 sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
10 sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')
11 # 设置Python环境变量,避免joblib并行计算时的编码问题
AttributeError: 'OutStream' object has no attribute 'buffer'。输出完整的解决问题修改后的代码
最新发布