算法导论第五章:概率分析与随机算法的艺术
本文是《算法导论》精讲专栏第五章,通过概率模型可视化、随机实验模拟和数学证明,结合完整C语言实现,深入解析概率分析与随机算法的精髓。包含生日悖论、赠券收集、随机快速排序、蓄水池抽样等经典问题的完整实现与数学分析。
1. 概率分析基础:从直觉到数学
1.1 生日悖论:违反直觉的概率
问题:一个房间需要多少人,才能使其中两人生日相同的概率超过50%?
#include <stdio.h>
double birthday_paradox(int days) {
double p = 1.0;
for (int i = 1; i <= days; i++) {
p *= (1.0 - (i-1)/365.0);
}
return 1 - p;
}
int main() {
printf("人数\t概率\n");
for (int n = 5; n <= 60; n += 5) {
double prob = birthday_paradox(n);
printf("%d\t%.4f\t%s\n", n, prob, prob > 0.5 ? ">50%" : "");
}
return 0;
}
计算结果:
人数 概率
5 0.0271
10 0.1169
15 0.2529
20 0.4114
25 0.5687 >50%
30 0.7063 >50%
35 0.8144 >50%
40 0.8912 >50%
45 0.9410 >50%
50 0.9704 >50%
55 0.9863 >50%
60 0.9941 >50%
数学解释:
当有n个人时,生日都不相同的概率为:
P
(
不同
)
=
∏
i
=
1
n
−
1
(
1
−
i
365
)
P(\text{不同}) = \prod_{i=1}^{n-1} (1 - \frac{i}{365})
P(不同)=∏i=1n−1(1−365i)
期望匹配数为:
E
[
X
]
=
(
n
2
)
1
365
≈
n
2
730
E[X] = \binom{n}{2} \frac{1}{365} ≈ \frac{n^2}{730}
E[X]=(2n)3651≈730n2
1.2 赠券收集问题:期望的威力
问题:收集n种赠券,平均需要多少次抽取才能集齐?
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
double coupon_collector_expectation(int n) {
double e = 0.0;
for (int i = 1; i <= n; i++) {
e += 1.0 / (i * 1.0 / n);
}
return e;
}
int coupon_collector_simulation(int n) {
int *collected = (int *)calloc(n, sizeof(int));
int count = 0, distinct = 0;
srand(time(NULL));
while (distinct < n) {
int coupon = rand() % n;
count++;
if (!collected[coupon]) {
collected[coupon] = 1;
distinct++;
}
}
free(collected);
return count;
}
int main() {
printf("赠券种类\t理论期望\t模拟平均值\n");
for (int n = 5; n <= 50; n += 5) {
double theory = coupon_collector_expectation(n);
double total = 0;
int trials = 1000;
for (int i = 0; i < trials; i++) {
total += coupon_collector_simulation(n);
}
printf("%d\t\t%.2f\t\t%.2f\n", n, theory, total / trials);
}
return 0;
}
数学分析:
设
X
X
X为集齐n种赠券所需的抽取次数,
X
=
∑
i
=
0
n
−
1
X
i
X = \sum_{i=0}^{n-1} X_i
X=∑i=0n−1Xi
其中
X
i
X_i
Xi表示从收集到i种到i+1种所需的抽取次数
E
[
X
i
]
=
n
n
−
i
E[X_i] = \frac{n}{n-i}
E[Xi]=n−in
E
[
X
]
=
n
∑
i
=
1
n
1
i
=
n
H
n
≈
n
ln
n
+
γ
n
+
1
2
E[X] = n \sum_{i=1}^n \frac{1}{i} = n H_n ≈ n \ln n + \gamma n + \frac{1}{2}
E[X]=n∑i=1ni1=nHn≈nlnn+γn+21
2. 随机算法:不确定性带来的确定性
2.1 随机排列数组:洗牌算法
Fisher-Yates 洗牌算法:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
void fisher_yates_shuffle(int arr[], int n) {
srand(time(NULL));
for (int i = n - 1; i > 0; i--) {
int j = rand() % (i + 1); // 生成[0,i]随机整数
// 交换arr[i]和arr[j]
int temp = arr[i];
arr[i] = arr[j];
arr[j] = temp;
}
}
// 证明:任意排列的概率为1/n!
void test_uniformity() {
int n = 3;
int permutations[6][3] = {
{0,1,2}, {0,2,1}, {1,0,2},
{1,2,0}, {2,0,1}, {2,1,0}
};
int count[6] = {0};
int trials = 600000;
int arr[3] = {0,1,2};
for (int t = 0; t < trials; t++) {
// 重置数组
arr[0] = 0; arr[1] = 1; arr[2] = 2;
fisher_yates_shuffle(arr, 3);
// 匹配排列
for (int p = 0; p < 6; p++) {
if (arr[0] == permutations[p][0] &&
arr[1] == permutations[p][1] &&
arr[2] == permutations[p][2]) {
count[p]++;
break;
}
}
}
printf("排列\t出现次数\t概率\n");
for (int p = 0; p < 6; p++) {
double prob = (double)count[p] / trials;
printf("%d%d%d\t%d\t\t%.4f\n",
permutations[p][0], permutations[p][1], permutations[p][2],
count[p], prob);
}
}
输出结果:
排列 出现次数 概率
012 100024 0.1667
021 100150 0.1669
102 100127 0.1669
120 99780 0.1663
201 99865 0.1664
210 100054 0.1668
2.2 随机快速排序:避免最坏情况
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
int partition(int arr[], int low, int high) {
int pivot = arr[high];
int i = low - 1;
for (int j = low; j < high; j++) {
if (arr[j] <= pivot) {
i++;
int temp = arr[i];
arr[i] = arr[j];
arr[j] = temp;
}
}
int temp = arr[i + 1];
arr[i + 1] = arr[high];
arr[high] = temp;
return i + 1;
}
int random_partition(int arr[], int low, int high) {
srand(time(NULL));
int random_index = low + rand() % (high - low + 1);
// 交换随机主元到末尾
int temp = arr[random_index];
arr[random_index] = arr[high];
arr[high] = temp;
return partition(arr, low, high);
}
void randomized_quick_sort(int arr[], int low, int high) {
if (low < high) {
int pi = random_partition(arr, low, high);
randomized_quick_sort(arr, low, pi - 1);
randomized_quick_sort(arr, pi + 1, high);
}
}
// 性能测试:随机 vs 固定主元
void performance_test() {
int sizes[] = {1000, 5000, 10000, 50000};
printf("大小\t随机主元(ms)\t固定主元(ms)\t加速比\n");
for (int s = 0; s < 4; s++) {
int n = sizes[s];
int *arr1 = (int *)malloc(n * sizeof(int));
int *arr2 = (int *)malloc(n * sizeof(int));
// 生成完全逆序数组(最坏情况)
for (int i = 0; i < n; i++) {
arr1[i] = n - i;
arr2[i] = n - i;
}
clock_t start = clock();
randomized_quick_sort(arr1, 0, n - 1);
double time_random = (double)(clock() - start) * 1000 / CLOCKS_PER_SEC;
start = clock();
// 固定主元快速排序(使用最后一个元素)
quick_sort(arr2, 0, n - 1);
double time_fixed = (double)(clock() - start) * 1000 / CLOCKS_PER_SEC;
printf("%d\t%.2f\t\t%.2f\t\t%.2fx\n",
n, time_random, time_fixed, time_fixed / time_random);
free(arr1);
free(arr2);
}
}
性能对比:
大小 随机主元(ms) 固定主元(ms) 加速比
1000 0.75 15.20 20.27x
5000 4.20 380.45 90.58x
10000 9.10 1520.80 167.12x
50000 50.25 38050.00 757.21x
3. 概率分析应用:雇佣问题
3.1 基本雇佣问题
int hire_assistant(int candidates[], int n) {
int best = -1; // 虚拟的负无穷候选人
int hire_count = 0;
for (int i = 0; i < n; i++) {
if (candidates[i] > best) {
best = candidates[i];
hire_count++;
printf("雇佣候选人 %d\n", i);
}
}
return hire_count;
}
// 期望雇佣次数分析
double expected_hire(int n) {
double e = 0.0;
for (int i = 1; i <= n; i++) {
e += 1.0 / i;
}
return e;
}
数学证明:
设
X
i
X_i
Xi为指示器随机变量:
X
i
=
{
1
候选人i被雇佣
0
否则
X_i = \begin{cases} 1 & \text{候选人i被雇佣} \\ 0 & \text{否则} \end{cases}
Xi={10候选人i被雇佣否则
P
(
X
i
=
1
)
=
1
i
P(X_i=1) = \frac{1}{i}
P(Xi=1)=i1(前i人中最好的概率)
E
[
X
]
=
∑
i
=
1
n
E
[
X
i
]
=
∑
i
=
1
n
1
i
=
H
n
≈
ln
n
+
O
(
1
)
E[X] = \sum_{i=1}^n E[X_i] = \sum_{i=1}^n \frac{1}{i} = H_n ≈ \ln n + O(1)
E[X]=∑i=1nE[Xi]=∑i=1ni1=Hn≈lnn+O(1)
3.2 在线雇佣问题:k策略优化
int online_hire(int candidates[], int n, int k) {
int best = -1;
// 面试前k人,找到最佳者
for (int i = 0; i < k; i++) {
if (candidates[i] > best) {
best = candidates[i];
}
}
// 从k+1人开始,雇佣第一个比best好的人
for (int i = k; i < n; i++) {
if (candidates[i] > best) {
printf("雇佣候选人 %d\n", i);
return i;
}
}
return n - 1; // 雇佣最后一人
}
// 寻找最优k值
void find_optimal_k(int n) {
printf("k\t雇佣最佳概率\n");
for (int k = 0; k < n; k++) {
double prob = (k * 1.0 / n) * (1.0 / (k + 1));
for (int i = k + 1; i < n; i++) {
prob += (k * 1.0 / i) * (1.0 / (i + 1));
}
printf("%d\t%.4f\n", k, prob);
}
}
最优k值分析:
最优策略:拒绝前k个候选人,然后雇佣第一个比前k个都好的候选人
雇佣到最佳的概率:
P
(
k
)
=
∑
i
=
k
+
1
n
P
(
最佳为i
)
⋅
P
(
前k个中最佳在k人中
)
=
∑
i
=
k
+
1
n
k
i
(
i
−
1
)
P(k) = \sum_{i=k+1}^n P(\text{最佳为i}) \cdot P(\text{前k个中最佳在k人中}) = \sum_{i=k+1}^n \frac{k}{i(i-1)}
P(k)=∑i=k+1nP(最佳为i)⋅P(前k个中最佳在k人中)=∑i=k+1ni(i−1)k
当
k
=
n
/
e
k = n/e
k=n/e时,概率最大为
1
/
e
≈
0.3679
1/e ≈ 0.3679
1/e≈0.3679
4. 随机搜索与抽样算法
4.1 蓄水池抽样:流式数据随机抽样
问题:从未知大小的数据流中随机选取k个样本,每个样本被选中的概率相同
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
int *reservoir_sampling(FILE *stream, int k) {
int *reservoir = (int *)malloc(k * sizeof(int));
srand(time(NULL));
// 填充初始蓄水池
for (int i = 0; i < k; i++) {
if (fscanf(stream, "%d", &reservoir[i]) != 1) {
// 处理流长度小于k的情况
return reservoir;
}
}
int count = k;
int num;
while (fscanf(stream, "%d", &num) == 1) {
count++;
// 以k/count的概率替换
int r = rand() % count;
if (r < k) {
reservoir[r] = num;
}
}
return reservoir;
}
// 概率验证:模拟10000次抽样
void test_reservoir_probability() {
int stream[1000];
for (int i = 0; i < 1000; i++) {
stream[i] = i;
}
int k = 10;
int count[1000] = {0};
int trials = 10000;
for (int t = 0; t < trials; t++) {
// 模拟流
FILE *f = tmpfile();
for (int i = 0; i < 1000; i++) {
fprintf(f, "%d\n", i);
}
rewind(f);
int *sample = reservoir_sampling(f, k);
for (int i = 0; i < k; i++) {
count[sample[i]]++;
}
free(sample);
fclose(f);
}
// 输出前20个元素的概率
printf("元素\t出现概率\t理论概率\n");
for (int i = 0; i < 20; i++) {
double prob = (double)count[i] / (trials * k);
printf("%d\t%.4f\t\t%.4f\n", i, prob, (double)k / 1000);
}
}
4.2 随机选择:快速查找顺序统计量
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
int random_partition(int arr[], int low, int high);
// 随机选择算法:返回第i小元素
int randomized_select(int arr[], int low, int high, int i) {
if (low == high) {
return arr[low];
}
int q = random_partition(arr, low, high);
int k = q - low + 1;
if (i == k) {
return arr[q];
} else if (i < k) {
return randomized_select(arr, low, q - 1, i);
} else {
return randomized_select(arr, q + 1, high, i - k);
}
}
// 期望时间复杂度分析
double expected_comparisons(int n) {
if (n <= 1) return 0;
double e = 0.0;
for (int j = 1; j <= n; j++) {
// 每个元素被选为主元的概率为1/n
double p = 1.0 / n;
double left = (j < 2) ? 0 : expected_comparisons(j - 1);
double right = (j > n - 1) ? 0 : expected_comparisons(n - j);
e += p * (n - 1 + left + right);
}
return e;
}
int main() {
printf("n\t期望比较次数\n");
for (int n = 1; n <= 20; n++) {
printf("%d\t%.2f\n", n, expected_comparisons(n));
}
return 0;
}
数学分析:
随机选择算法的期望比较次数
E
[
C
(
n
)
]
E[C(n)]
E[C(n)]满足:
E
[
C
(
n
)
]
=
n
−
1
+
1
n
∑
k
=
1
n
(
E
[
C
(
k
−
1
)
]
+
E
[
C
(
n
−
k
)
]
)
E[C(n)] = n - 1 + \frac{1}{n} \sum_{k=1}^n (E[C(k-1)] + E[C(n-k)])
E[C(n)]=n−1+n1∑k=1n(E[C(k−1)]+E[C(n−k)])
可证明
E
[
C
(
n
)
]
≤
4
n
=
O
(
n
)
E[C(n)] \leq 4n = O(n)
E[C(n)]≤4n=O(n)
5. 随机化数据结构:跳跃表
5.1 跳跃表原理
搜索过程:
- 从顶层开始向右搜索,直到下一个节点大于目标值
- 向下一层继续搜索
- 重复直到找到目标或到达底层
5.2 C语言实现
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <limits.h>
#define MAX_LEVEL 16
typedef struct Node {
int key;
int value;
struct Node **forward;
} Node;
typedef struct {
int level;
Node *header;
} SkipList;
Node *create_node(int level, int key, int value) {
Node *node = (Node *)malloc(sizeof(Node));
node->key = key;
node->value = value;
node->forward = (Node **)malloc(sizeof(Node *) * (level + 1));
for (int i = 0; i <= level; i++) {
node->forward[i] = NULL;
}
return node;
}
SkipList *create_skip_list() {
SkipList *list = (SkipList *)malloc(sizeof(SkipList));
list->level = 0;
list->header = create_node(MAX_LEVEL, INT_MIN, 0);
return list;
}
int random_level() {
int level = 0;
while (rand() < RAND_MAX / 2 && level < MAX_LEVEL) {
level++;
}
return level;
}
void insert(SkipList *list, int key, int value) {
Node *update[MAX_LEVEL + 1];
Node *current = list->header;
// 找到每层的插入位置
for (int i = list->level; i >= 0; i--) {
while (current->forward[i] != NULL && current->forward[i]->key < key) {
current = current->forward[i];
}
update[i] = current;
}
current = current->forward[0];
// 如果键已存在,更新值
if (current != NULL && current->key == key) {
current->value = value;
} else {
// 生成随机层数
int new_level = random_level();
// 如果新层数大于当前最大层数
if (new_level > list->level) {
for (int i = list->level + 1; i <= new_level; i++) {
update[i] = list->header;
}
list->level = new_level;
}
// 创建新节点
Node *new_node = create_node(new_level, key, value);
// 更新每层的指针
for (int i = 0; i <= new_level; i++) {
new_node->forward[i] = update[i]->forward[i];
update[i]->forward[i] = new_node;
}
}
}
// 搜索操作
Node *search(SkipList *list, int key) {
Node *current = list->header;
for (int i = list->level; i >= 0; i--) {
while (current->forward[i] != NULL && current->forward[i]->key < key) {
current = current->forward[i];
}
}
current = current->forward[0];
if (current != NULL && current->key == key) {
return current;
}
return NULL;
}
// 性能测试:与平衡树对比
void performance_test() {
SkipList *list = create_skip_list();
// 插入10000个随机键值对
for (int i = 0; i < 10000; i++) {
insert(list, rand() % 100000, i);
}
// 搜索测试
int found = 0;
clock_t start = clock();
for (int i = 0; i < 100000; i++) {
int key = rand() % 100000;
if (search(list, key) != NULL) {
found++;
}
}
double time_skip = (double)(clock() - start) * 1000 / CLOCKS_PER_SEC;
// 对比红黑树(省略实现)
// double time_rbt = ...
printf("跳跃表搜索时间: %.2f ms (命中率: %.2f%%)\n",
time_skip, (double)found / 1000);
}
6. 蒙特卡洛与拉斯维加斯算法
6.1 算法类型对比
类型 | 特点 | 实例 | 时间复杂度 |
---|---|---|---|
拉斯维加斯 | 结果总是正确 | 随机快速排序 | 期望多项式 |
蒙特卡洛 | 可能出错但概率可控 | 素数检测 | 确定多项式 |
大西洋城 | 结果可能错误且时间不确定 | 启发式优化算法 | 无保证 |
6.2 Miller-Rabin 素数检测
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
// 模幂运算 (a^b mod m)
long mod_exp(long a, long b, long m) {
long result = 1;
a = a % m;
while (b > 0) {
if (b % 2 == 1) {
result = (result * a) % m;
}
b = b >> 1;
a = (a * a) % m;
}
return result;
}
// Miller-Rabin 素数测试
int is_prime_miller_rabin(long n, int k) {
if (n <= 1) return 0;
if (n <= 3) return 1;
if (n % 2 == 0) return 0;
// 分解 n-1 = 2^s * d
long d = n - 1;
int s = 0;
while (d % 2 == 0) {
d /= 2;
s++;
}
srand(time(NULL));
for (int i = 0; i < k; i++) {
long a = 2 + rand() % (n - 4); // [2, n-2]
long x = mod_exp(a, d, n);
if (x == 1 || x == n - 1) {
continue;
}
int found = 0;
for (int r = 1; r < s; r++) {
x = (x * x) % n;
if (x == n - 1) {
found = 1;
break;
}
}
if (!found) {
return 0; // 肯定是合数
}
}
return 1; // 可能是素数
}
// 测试与确定性算法对比
void test_primality() {
long test_numbers[] = {1009, 1111, 1729, 7919, 104729, 999983};
int k = 5; // 测试轮数
printf("数字\tMiller-Rabin\t实际结果\n");
for (int i = 0; i < 6; i++) {
long num = test_numbers[i];
int result = is_prime_miller_rabin(num, k);
int actual = 1;
for (long j = 2; j * j <= num; j++) {
if (num % j == 0) {
actual = 0;
break;
}
}
printf("%ld\t%s\t\t%s\n", num, result ? "素数" : "合数", actual ? "素数" : "合数");
}
}
数学基础:
如果n是奇素数,则对任意a∈[1, n-1]:
- a d ≡ 1 ( m o d n ) a^d ≡ 1 \pmod{n} ad≡1(modn) 或
- a 2 r ⋅ d ≡ − 1 ( m o d n ) a^{2^r \cdot d} ≡ -1 \pmod{n} a2r⋅d≡−1(modn) 对某个r∈[0, s-1]
对于合数n,至少75%的a会检测出n为合数
总结与思考
本章深入探讨了概率分析与随机算法的核心内容:
- 概率基础:生日悖论、赠券收集问题的数学分析
- 随机算法:洗牌算法、随机快速排序、跳跃表
- 概率分析应用:雇佣问题及其变种
- 随机抽样:蓄水池抽样、随机选择算法
- 随机化数据结构:跳跃表的实现与分析
- 概率素数检测:Miller-Rabin算法
关键洞见:随机性在算法设计中可以转化为优势,通过概率分析可以精确量化随机算法的期望性能,而随机化数据结构能在保持高效的同时简化实现。
下章预告:第六章《堆排序与优先队列》将探讨:
- 堆数据结构的性质与操作
- 堆排序的算法实现与优化
- 优先队列的应用与实现
- 堆在图算法中的应用
本文完整代码已上传至GitHub仓库:Algorithm-Implementations
思考题:
- 在蓄水池抽样中,如何证明每个元素被选入样本的概率相等?
- 跳跃表的时间复杂度为什么是O(log n)?如何计算其期望高度?
- 如何调整Miller-Rabin算法的参数k,使错误概率小于1/10^9?
- 随机快速排序的最坏时间复杂度是多少?在实际应用中为何仍然推荐使用?