These are solutions for the 湖科大 class-of-2022 Data Mining course project (choose 5 of the 10 problems; note that the required programming language is Python).
P.S.: Since the course project is still underway, please try to do the work yourself rather than copying the code verbatim. Thanks for your cooperation.
Also, as there is no official answer key, corrections are welcome wherever I got something wrong.
Link:
grey66.cn
For this problem the data are highly repetitive, so almost every generated association rule is a strong rule and almost every frequent itemset has support above the minimum support; the association rules are therefore not shown in detail, and the generated frequent itemsets are listed below.
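For reference: with the 14 transactions listed in the code below and min_support = 0.5, the support-count threshold the code uses is int(0.5 * 14) = 7, so an itemset counts as frequent when it appears in at least 7 of the 14 transactions.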
from itertools import combinations
# combinations comes from Python's built-in itertools module; it generates all unordered
# combinations (without repetition) of a given length from the input data.
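# For example:
#   list(combinations(['A', 'B', 'C'], 2)) -> [('A', 'B'), ('A', 'C'), ('B', 'C')]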
'''created by grey'''
def generate_candidates(frequent_itemsets, k):
    """Generate candidate itemsets of size k."""
    candidates = set()
    itemsets = list(frequent_itemsets)
    for i in range(len(itemsets)):
        for j in range(i + 1, len(itemsets)):
            union_set = itemsets[i].union(itemsets[j])
            if len(union_set) == k:
                candidates.add(union_set)
    return candidates

def prune_candidates(candidates, transactions, min_support_count):
    """Filter candidates down to frequent itemsets using the support-count threshold."""
    item_count = {item: 0 for item in candidates}  # initialize a counter for each candidate itemset, starting at 0
    for transaction in transactions:
        for item in candidates:
            if item.issubset(transaction):  # the candidate itemset is contained in this transaction
                item_count[item] += 1
    return {item for item, count in item_count.items() if count >= min_support_count}, item_count
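# A tiny example of prune_candidates (hypothetical inputs):
#   prune_candidates({frozenset({'A'}), frozenset({'B'})}, [{'A', 'B'}, {'A'}], 2)
#   keeps frozenset({'A'}) (count 2) and drops frozenset({'B'}) (count 1).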
def apriori(transactions, min_support, min_confidence):
    """Apriori algorithm."""
    '''Data preprocessing'''
    transactions = [set(transaction) for transaction in transactions]  # turn each transaction into a set
    min_support_count = int(min_support * len(transactions))  # convert min_support into a support-count threshold
    # Generate the frequent 1-itemsets
    items = set(item for transaction in transactions for item in transaction)
    '''
    Equivalent to:
    items = set()
    for transaction in transactions:
        for item in transaction:
            items.add(item)'''
    candidates = {frozenset([item]) for item in items}  # wrap each item in a frozenset and add it to candidates
    frequent_itemsets, support_data = prune_candidates(candidates, transactions, min_support_count)
    all_frequent_itemsets = frequent_itemsets.copy()
    k = 2
    # Iteratively generate the frequent k-itemsets
    while frequent_itemsets:
        candidates = generate_candidates(frequent_itemsets, k)
        frequent_itemsets, item_support = prune_candidates(candidates, transactions, min_support_count)
        support_data.update(item_support)
        all_frequent_itemsets.update(frequent_itemsets)
        k += 1
    # Mine the strong association rules
    rules = []
    for itemset in all_frequent_itemsets:
        #2205010216
        if len(itemset) > 1:  # only consider itemsets with more than one element
            for length in range(1, len(itemset)):  # for each itemset, generate consequents of every possible length (1 to len(itemset) - 1)
                for consequence in combinations(itemset, length):  # use combinations to generate consequents of the given length
                    antecedent = itemset - set(consequence)
                    confidence = support_data[itemset] / support_data[antecedent]
                    if confidence >= min_confidence:
                        rules.append((antecedent, set(consequence), confidence))
    return all_frequent_itemsets, rules  # return all frequent itemsets and the generated rules
# Input transactions
data = [
["A", "B", "C", "D", "E", "F", "G"],
["A", "B", "C", "D", "E", "H"],
["A", "B", "C", "D", "E", "F", "G", "H"],
["A", "B", "C", "G", "H"],
["A", "B", "C", "D", "G", "H"],
["A", "B", "C", "D", "E", "F", "G", "H"],
["A", "B", "C", "D", "E", "F", "G"],
["A", "B", "C", "E", "G", "H"],
["A", "B", "C", "D", "E", "F", "H"],
["C", "D", "E", "F", "G", "H"],
["A", "B", "C", "D", "G", "H"],
["A", "C", "D", "E", "F", "G", "H"],
["A", "B", "C", "E", "F", "G", "H"],
["B", "C", "E", "F", "G", "H"],
]
min_support = 0.5
min_confidence = 0.5
frequent_itemsets, rules = apriori(data, min_support, min_confidence)
# Print the results
print("Frequent itemsets:")
for itemset in sorted(frequent_itemsets, key=lambda x: (len(x), sorted(x))):  # sort by itemset length (ascending), then lexicographically within the same length
    print(itemset)
print("\nStrong association rules:")
for antecedent, consequence, confidence in rules:
    print(f"{set(antecedent)} => {consequence} (confidence: {confidence:.2f})")
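As a quick sanity check (a sketch that reuses the data list above; the rule {A} => {B} is just an arbitrary example), a single rule's confidence can be recomputed straight from the raw transactions:

count_A = sum(1 for t in data if {"A"}.issubset(t))
count_AB = sum(1 for t in data if {"A", "B"}.issubset(t))
print(count_AB / count_A)   # should match the confidence printed for {A} => {B}, if that rule was generated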
/*
#include <iostream>
#include <vector>
#include <cmath>
#include <algorithm>
#include <map>
created by grey
*/
#include <bits/stdc++.h>
using namespace std;

// A struct to store each student's height and grade (short / medium / tall)
struct Student {
    double height;
    string grade;
    Student(double h, string g) : height(h), grade(g) {}
};

// created by grey
double Distance(double height1, double height2) {
    return abs(height1 - height2);
}

// KNN algorithm
string knn(const vector<Student>& students, double newHeight, int k) {
    map<string, int> gradeCount;
    vector<pair<double, string> > distances;
    // Compute the distance between the new student and every registered student, keeping each grade
    for (size_t i = 0; i < students.size(); ++i) {
        const Student& student = students[i];
        double distance = Distance(newHeight, student.height);
        distances.push_back(make_pair(distance, student.grade));
        //2205010216
    }
    // Sort by distance
    sort(distances.begin(), distances.end());
    // Take the k nearest neighbours
    for (int i = 0; i < k; ++i) {
        gradeCount[distances[i].second]++;
        cout << distances[i].second << endl;
    }
    // The grade that occurs most often among the neighbours becomes the new student's grade
    string answer;
    int maxCount = 0;
    for (map<string, int>::const_iterator it = gradeCount.begin(); it != gradeCount.end(); ++it) {
        if (it->second > maxCount) {
            answer = it->first;
            maxCount = it->second;
        }
    }
    return answer;
}

int main() {
    // Registered students
    vector<Student> students;
    students.push_back(Student(1.5, "short"));
    students.push_back(Student(1.92, "tall"));
    students.push_back(Student(1.7, "medium"));
    students.push_back(Student(1.73, "medium"));
    students.push_back(Student(1.6, "short"));
    students.push_back(Student(1.75, "medium"));
    students.push_back(Student(1.6, "short"));
    students.push_back(Student(1.9, "tall"));
    students.push_back(Student(1.68, "medium"));
    students.push_back(Student(1.78, "medium"));
    students.push_back(Student(1.7, "medium"));
    students.push_back(Student(1.68, "medium"));
    students.push_back(Student(1.65, "short"));
    students.push_back(Student(1.78, "medium"));
    // The new student Yi Chang's height
    double newHeight = 1.74;
    // Use KNN to determine the new student's grade
    string grade = knn(students, newHeight, 5);
    cout << "The new student Yi Chang's height grade is: " << grade << endl;
    return 0;
}
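For this data set the answer is easy to verify by hand: the two neighbours at distance 0.01 (heights 1.73 and 1.75) and the four at distance 0.04 (the 1.70 and 1.78 entries) are all labelled medium, so whichever three of the latter are chosen, the program should report medium.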
import math
from collections import Counter

# Compute entropy
def entropy(data):
    total = len(data)               # number of samples
    counts = Counter(data)          # frequency of each class
    entropy_value = 0               # initialize the entropy
    for label in counts.values():   # iterate over the classes
        prob = label / total
        entropy_value -= prob * math.log2(prob)
    return entropy_value
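# For example, for the 15 rows in the test data below (10 'buy', 5 'no buy'):
#   entropy = -(10/15) * log2(10/15) - (5/15) * log2(5/15) ≈ 0.918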
# Compute conditional entropy
def conditional_entropy(data, feature_index):
    total = len(data)
    feature_values = [row[feature_index] for row in data]  # all values taken by the chosen feature
    feature_value_counts = Counter(feature_values)
    cond_entropy = 0
    for value, count in feature_value_counts.items():
        subset = [row[-1] for row in data if row[feature_index] == value]  # class labels of the subset with this feature value
        cond_entropy += (count / total) * entropy(subset)
    return cond_entropy

# Compute information gain
def information_gain(data, feature_index):
    total_entropy = entropy([row[-1] for row in data])       # total entropy
    cond_entropy = conditional_entropy(data, feature_index)  # conditional entropy
    return total_entropy - cond_entropy                      # information gain

# Find the feature with the largest information gain
def best_feature(data, features):
    max_gain = -1            # initialize the maximum information gain
    best_feature_index = -1  # initialize the index of the best feature
    for i in range(len(features) - 1):    # the last column is the target variable, so it is not a candidate feature
        gain = information_gain(data, i)  # information gain of the i-th feature
        if gain > max_gain:               # if it beats the current maximum
            max_gain = gain               # update the maximum information gain
            best_feature_index = i        # update the index of the best feature
    return best_feature_index             # return the index of the best feature

# Build the decision tree
def id3(data, features):
    labels = [row[-1] for row in data]  # the last column of each sample, i.e. whether a computer is bought
    # If all samples belong to the same class, return that class
    if len(set(labels)) == 1:
        return labels[0]
    # If no features are left to split on, return the most common class in the data
    if len(features) == 1:  # only the target column remains
        return Counter(labels).most_common(1)[0][0]
    # Find the feature with the largest information gain
    best_feature_index = best_feature(data, features)
    best_feature_name = features[best_feature_index]  # look up the feature name by index
    tree = {best_feature_name: {}}
    # Split the data set on the best feature
    feature_values = set(row[best_feature_index] for row in data)  # all possible values of the best feature
    for value in feature_values:
        # Build the subset for this value
        subset = [row for row in data if row[best_feature_index] == value]  # used by the recursion below
        # Recursively build the subtree
        subtree = id3([row[:best_feature_index] + row[best_feature_index + 1:] for row in subset],
                      features[:best_feature_index] + features[best_feature_index + 1:])  # drop the best feature's column from both the rows and the feature list
        tree[best_feature_name][value] = subtree  # attach the subtree under this value of the best feature
        #2205010216
    return tree

# Test data
data = [
    ['youth', 'high', 'no', 'fair', 'no buy'],
    ['youth', 'high', 'no', 'excellent', 'no buy'],
    ['middle', 'high', 'no', 'fair', 'buy'],
    ['senior', 'medium', 'no', 'fair', 'buy'],
    ['senior', 'low', 'yes', 'fair', 'buy'],
    ['senior', 'low', 'yes', 'excellent', 'no buy'],
    ['middle', 'low', 'yes', 'excellent', 'buy'],
    ['youth', 'medium', 'no', 'fair', 'no buy'],
    ['youth', 'low', 'yes', 'fair', 'buy'],
    ['senior', 'medium', 'yes', 'fair', 'buy'],
    ['youth', 'medium', 'yes', 'excellent', 'buy'],
    ['middle', 'medium', 'no', 'excellent', 'buy'],
    ['middle', 'high', 'yes', 'fair', 'buy'],
    ['senior', 'medium', 'no', 'excellent', 'no buy'],
    ['senior', 'medium', 'no', 'excellent', 'buy'],
]
# Feature names
features = ['age', 'income', 'student', 'credit', 'buys_computer']
# Build the decision tree
tree = id3(data, features)
# Print the decision tree
import pprint
pprint.pprint(tree)
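A small helper one could add to classify a new sample by walking the nested dict returned by id3 (a sketch; classify is not part of the original program, and the sample must list its feature values in the same order as features):

def classify(tree, features, sample):
    if not isinstance(tree, dict):             # reached a leaf, i.e. a class label
        return tree
    feature = next(iter(tree))                 # the feature this node splits on
    value = sample[features.index(feature)]    # the sample's value for that feature
    subtree = tree[feature].get(value)         # None if this value never appeared during training
    return classify(subtree, features, sample) if subtree is not None else None

print(classify(tree, features, ['youth', 'high', 'no', 'fair']))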
K-means is a randomized algorithm: different randomly chosen initial points can lead to different results. The result below is the one that came up most often across repeated runs.
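If you want a particular run to be reproducible while experimenting, one option (a sketch; not used in the code below) is to fix NumPy's random seed before calling kmeans:

np.random.seed(0)   # any fixed integer makes np.random.choice pick the same initial centers every run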
import numpy as np

# Euclidean distance
def euclidean_distance(a, b):
    return np.sqrt(np.sum((a - b) ** 2))  # Euclidean distance: sqrt(sum((a_i - b_i)^2))
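# For example, euclidean_distance(np.array([1, 2]), np.array([4, 6])) gives 5.0 (a 3-4-5 right triangle).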
# K-means clustering
def kmeans(data, k, max_iterations=100):  # cap the number of iterations at 100
    # Randomly choose k initial centers; np.random.choice picks k distinct row indices
    # (data.shape[0] is the number of points, 13 here; replace=False prevents duplicates).
    # Cast to float so the mean update below is not truncated to integers.
    centers = data[np.random.choice(data.shape[0], k, replace=False)].astype(float)
    # Stores the centers from the previous iteration, initialized to zeros
    prev_centers = np.zeros_like(centers)
    # Records which cluster each point belongs to
    labels = np.zeros(data.shape[0])
    for _ in range(max_iterations):
        # Assign each data point to the nearest center
        for i in range(data.shape[0]):
            distances = [euclidean_distance(data[i], center) for center in centers]
            labels[i] = np.argmin(distances)  # np.argmin returns the index of the smallest distance, i.e. the nearest cluster
        # Remember the current centers
        prev_centers = centers.copy()
        # Recompute the center of each cluster
        for i in range(k):
            cluster_points = data[labels == i]  # boolean mask selecting the points in cluster i
            if len(cluster_points) > 0:
                centers[i] = np.mean(cluster_points, axis=0)  # the mean of the cluster becomes its new center
        # Stop iterating once the centers no longer change (convergence)
        if np.all(centers == prev_centers):
            break
    return centers, labels
# Input data
data = np.array([
[1, 2], [2, 1], [2, 4], [4, 3], [5, 8], [6, 7], [6, 9], [7, 9],
[9, 5], [1, 12], [3, 12], [5, 12], [3, 3]
])
k = 3
# Run K-means
centers, labels = kmeans(data, k)
# Collect each cluster and the points it contains
clusters = {i: [] for i in range(k)}
for i, label in enumerate(labels):
    clusters[int(label)].append((f"P{i + 1}", data[i]))
#2205010216
# Print the result
for cluster_num, points in clusters.items():
    print(f"Cluster {cluster_num + 1}:")
    for point_label, point_coords in points:
        print(f"  {point_label} -> {point_coords}")
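Since different initial centers can give different partitions (see the note above), one simple way to compare two runs is the within-cluster sum of squared errors; a minimal sketch using the variables already computed:

sse = sum(euclidean_distance(data[i], centers[int(labels[i])]) ** 2 for i in range(len(data)))
print(f"SSE: {sse:.2f}")  # the lower, the tighter the clustering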
Personally, I find the matrix-based method from the textbook overly complicated.
After consulting an article on 博客园 (Cnblogs), I arrived at the following way of solving it:
The exact values here should work out to 3/9, 2/9, 2/9 and 2/9.
However, I use decimal floating-point numbers, so there is some rounding error; you could instead store the numerator and denominator and compute with exact fractions, as sketched below.
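For example, Python's built-in fractions module keeps every intermediate value exact (a small illustration only; the code below does not use it):

from fractions import Fraction
print(Fraction(1, 3) + Fraction(1, 9))  # prints 4/9, with no floating-point rounding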
The code that follows imports networkx, a commonly used package for working with graphs. The results for each iteration are printed as it runs:
import networkx as nx

def page_rank(dg, damping_factor=0.85, max_iterations=100, min_delta=1e-7):
    """Compute the PageRank values of a directed graph."""
    # First, give every dangling node (a node with no out-links) out-links to all nodes
    for node in dg.nodes:
        if dg.out_degree(node) == 0:      # the node has no out-links
            for node2 in dg.nodes:
                dg.add_edge(node, node2)  # add an edge from this node to every node
    nodes = dg.nodes
    graph_size = len(nodes)
    if graph_size == 0:
        return {}
    # Initialize every node's PageRank value
    page_rank = {node: 1.0 / graph_size for node in nodes}
    damping_value = (1.0 - damping_factor) / graph_size  # the (1 - α) / N part of the formula
    for i in range(max_iterations):
        change = 0
        for node in nodes:
            rank = 0
            # Accumulate the PR contributions of the pages linking to this node
            for incident_page in dg.predecessors(node):  # iterate over all in-linking pages
                rank += damping_factor * (page_rank[incident_page] / dg.out_degree(incident_page))
            rank += damping_value                   # add the smoothing term
            change += abs(page_rank[node] - rank)   # accumulate the absolute change
            page_rank[node] = rank
        print(f"Iteration {i + 1}:")
        print(page_rank)
        if change < min_delta:
            print(f"finished in {i + 1} iterations!")
            break
    else:
        print("finished out of 100 iterations!")
    return page_rank

#2205010216
# Create the graph
G = nx.DiGraph()
# Add the nodes
G.add_nodes_from(["A", "B", "C", "D"])
# Add the edges (links between pages)
G.add_edges_from([("A", "B"), ("A", "C"), ("A", "D"),
                  ("B", "D"), ("B", "A"), ("C", "A"),
                  ("D", "C"), ("D", "B")])
# Compute PageRank
page_ranks = page_rank(G)
# Print the final PageRank values
print("The final page rank is\n", page_ranks)
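As an optional cross-check, networkx also ships its own implementation; the values it returns should be close to the ones computed above (a one-line sketch):

print(nx.pagerank(G, alpha=0.85))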