// SVM.cpp : 定义控制台应用程序的入口点。
//
#include "stdafx.h"
#include <cassert>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <memory.h>
// Forward declarations of the routines implemented later in this file.

// Reads nDataLength sample records from the text file pFileName into pData.
void GetDataFromFile(DATAINFO* pData, const char *pFileName, const int nDataLength);
// Derives the SVM coefficients (pVector) from the training samples in pData.
void GetSVMVector(DATAINFO* pData, int nDataLength, double* pVector, int nVectorLength);
// Classifies the test samples; writes predicted labels to pTestType and scores to pDataScore.
void ClassTestData(DATAINFO* pData, int nDataLength, double* pVector, int nVectorLength, int* pTestType, DATASCORE *pDataScore);
// Compares predicted labels with the actual ones and stores three rates in pResult.
void StatistClassResult(DATAINFO* pData, int nDataLength, int* pTestType, float *pResult);
// Returns the larger of a and b.
float max(float a, float b);
// Returns the smaller of a and b.
float min(float a, float b);
// Sorts the score records in ascending order of score.
void sort(DATASCORE *pDataScore, int nDataLength);
// Computes the AUC score from the score records (definition not fully visible in this excerpt).
float GetAUC(DATASCORE *pDataScore, int nDataLength);
int _tmain(int argc, _TCHAR* argv[])
{
double dwVector[NUM_FEATURE] = {0};
int nTestTypeBuf[NUM_TEST_DATA] = {0};//测试样本数
DATAINFO *pData = new DATAINFO[NUM_TRAIN_DATA];
assert(pData);
const char strTrainFileName[] = "train.data";
//const char strTrainFileName[] = "1.data";
const char strTestFileName[] = "test.data";
GetDataFromFile(pData, strTrainFileName, NUM_TRAIN_DATA);
GetSVMVector(pData, NUM_TRAIN_DATA, dwVector, NUM_FEATURE);
if(pData) delete[] pData;
pData = new DATAINFO[NUM_TEST_DATA];
DATASCORE *pDataScore = new DATASCORE[NUM_TEST_DATA];
assert(pData && pDataScore);
GetDataFromFile(pData, strTestFileName, NUM_TEST_DATA);
ClassTestData(pData, NUM_TEST_DATA, dwVector, NUM_FEATURE, nTestTypeBuf, pDataScore);
sort(pDataScore, NUM_TEST_DATA);
float fResult[3] = {0};
StatistClassResult(pData, NUM_TEST_DATA, nTestTypeBuf, fResult);
printf("SVM分类器系数如下:\n");
for(int i = 0; i < NUM_FEATURE; i++)
{
printf("W[%d] = %f\n", i, dwVector[i]);
}
printf("True Positive rate:%f\nFalse Positive rate:%f\n总的分类正确率:%f\n", fResult[0], fResult[1], fResult[2]);
// printf("Min score = %f\nMax score = %f\n", pDataScore[0].score, pDataScore[NUM_TEST_DATA - 1].score);
printf("分类器AUC得分:%f\n", GetAUC(pDataScore, NUM_TEST_DATA));
if(pData)
{
delete[] pData;
pData = NULL;
}
if (pDataScore)
{
delete[] pDataScore;
pDataScore = NULL;
}
printf("Please input enter to quit!\n");
getchar();
return 0;
}
// Reads sample records from a text file (used for both the training set and
// the test set).
//
// Every line holds one record. A line is split into tokens on ' ' and '\n';
// the first NUM_FEATURE tokens are parsed with atof() into
// dwFeature[0..NUM_FEATURE-1] and the following token is parsed with atoi()
// into nType. The final token of a line is stored even when the line has no
// trailing newline (e.g. the last line of the file).
//
// pData:       output array with room for nDataLength records
// pFileName:   name of the file holding the data
// nDataLength: number of records (lines) to read
//
// If the file cannot be opened, or ends before nDataLength lines were read,
// the problem is reported on stderr and the records that could not be read
// are zero-filled so that callers never see uninitialised data.
void GetDataFromFile(DATAINFO* pData, const char *pFileName, const int nDataLength)
{
    assert(pData && pFileName);

    int nRead = 0;
    FILE *pFile = fopen(pFileName, "r");
    if (NULL == pFile)
    {
        perror(pFileName);
    }
    else
    {
        char strLine[128];
        char strTemp[16];
        while (nRead < nDataLength && NULL != fgets(strLine, sizeof(strLine), pFile))
        {
            int k = 0;  // write position inside strTemp
            int m = 0;  // index of the next feature; NUM_FEATURE means "label comes next"
            for (int j = 0; ; j++)
            {
                const char c = strLine[j];
                const int bEnd = (0 == c);
                if (' ' == c || 0x0A == c || (bEnd && k > 0))
                {
                    // A separator (or the end of the line with a pending
                    // token) completes the current token.
                    strTemp[k] = 0;
                    if (NUM_FEATURE == m)
                    {
                        pData[nRead].nType = atoi(strTemp);
                        m = 0;
                    }
                    else
                    {
                        pData[nRead].dwFeature[m] = atof(strTemp);
                        m++;
                    }
                    k = 0;
                }
                else if (!bEnd && k < (int)sizeof(strTemp) - 1)
                {
                    // Characters beyond the token buffer capacity are dropped
                    // instead of overflowing strTemp.
                    strTemp[k++] = c;
                }
                if (bEnd)
                {
                    break;
                }
            }
            nRead++;
        }
        if (nRead < nDataLength)
        {
            fprintf(stderr, "%s: expected %d records but only %d could be read\n",
                    pFileName, nDataLength, nRead);
        }
        fclose(pFile);
        pFile = NULL;
    }

    // Give every record that was not read a defined (all-zero) value.
    for (int i = nRead; i < nDataLength; i++)
    {
        for (int m = 0; m < NUM_FEATURE; m++)
        {
            pData[i].dwFeature[m] = 0;
        }
        pData[i].nType = 0;
    }
}
// Derives the linear SVM coefficients from the training samples using a
// simplified SMO scheme: in every iteration two samples are picked at random
// and their multipliers are optimised jointly.
//
// pData:         training samples
// nDataLength:   number of training samples
// pVector:       output, receives the NUM_FEATURE coefficients (it is reset
//                to zero first, matching the all-zero initial multipliers)
// nVectorLength: number of coefficients; not used by the implementation,
//                which always works on NUM_FEATURE entries
//
// The decision value of a sample x is  pVector . x + NUM_B_THRESHOLD.
void GetSVMVector(DATAINFO* pData, int nDataLength, double* pVector, int nVectorLength)
{
    assert(pData && pVector);
    (void)nVectorLength;

    for (int j = 0; j < NUM_FEATURE; j++)
    {
        pVector[j] = 0;
    }
    if (nDataLength <= 0)
    {
        return;     // nothing to train on; also avoids rand() % 0
    }

    float *pfWeight = new float[nDataLength];   // one multiplier per sample
    assert(pfWeight);
    memset(pfWeight, 0, sizeof(float) * nDataLength);

    const float fC = 0.5;   // upper bound of every multiplier
    srand((unsigned)time(NULL));
    for (int i = 0; i < NUM_ITERATAR; i++)
    {
        const int nNum1 = rand() % nDataLength;
        const int nNum2 = rand() % nDataLength;
        if (nNum1 == nNum2)
            continue;

        // Feasible interval [fLow, fHigh] of the second multiplier.
        float fLow, fHigh;
        if (pData[nNum1].nType == pData[nNum2].nType)
        {
            fLow = max(0, pfWeight[nNum2] + pfWeight[nNum1] - fC);
            fHigh = min(fC, pfWeight[nNum2] + pfWeight[nNum1]);
        }
        else
        {
            fLow = max(0, pfWeight[nNum2] - pfWeight[nNum1]);
            fHigh = min(fC, fC + pfWeight[nNum2] - pfWeight[nNum1]);
        }

        // fSum = 2*x1.x2 - x1.x1 - x2.x2 = -|x1 - x2|^2 (second derivative of
        // the objective along the constraint); fg1/fg2 accumulate the current
        // decision values of both samples over ALL features.
        float fSum = 0, fg1 = 0, fg2 = 0;
        for (int j = 0; j < NUM_FEATURE; j++)
        {
            fSum += 2.0 * pData[nNum1].dwFeature[j] * pData[nNum2].dwFeature[j]
                - pData[nNum1].dwFeature[j] * pData[nNum1].dwFeature[j]
                - pData[nNum2].dwFeature[j] * pData[nNum2].dwFeature[j];
            fg1 += (float)(pVector[j] * pData[nNum1].dwFeature[j]);
            fg2 += (float)(pVector[j] * pData[nNum2].dwFeature[j]);
        }
        fg1 += NUM_B_THRESHOLD;
        fg2 += NUM_B_THRESHOLD;

        // Identical samples give fSum == 0 (or a tiny positive rounding
        // residue); dividing by it would produce NaN/inf multipliers.
        if (fSum >= 0)
            continue;

        // Prediction errors of the CURRENT pair, computed before the update.
        const float fError1 = fg1 - pData[nNum1].nType;
        const float fError2 = fg2 - pData[nNum2].nType;

        // Unconstrained optimum of the second multiplier, clipped to the box.
        const float fOld = pfWeight[nNum2];
        pfWeight[nNum2] -= pData[nNum2].nType * (fError1 - fError2) / fSum;
        if (pfWeight[nNum2] >= fHigh)
        {
            pfWeight[nNum2] = fHigh;
        }
        else if (pfWeight[nNum2] <= fLow)
        {
            pfWeight[nNum2] = fLow;
        }
        // The first multiplier moves so that sum(weight * label) stays constant.
        pfWeight[nNum1] += pData[nNum1].nType * pData[nNum2].nType * (fOld - pfWeight[nNum2]);

        // Rebuild the coefficient vector: sum over samples of weight * label * feature.
        for (int j = 0; j < NUM_FEATURE; j++)
        {
            pVector[j] = 0;
            for (int k = 0; k < nDataLength; k++)
            {
                pVector[j] += pfWeight[k] * pData[k].nType * pData[k].dwFeature[j];
            }
        }
        if (0 == (i + 1) % 1000)
        {
            printf("已经迭代%d次!\n", i + 1);
        }
    }

    delete[] pfWeight;
    pfWeight = NULL;
}
// Classifies every sample of the test set.
//
// pData:         test samples
// nDataLength:   number of test samples
// pVector:       SVM coefficients
// nVectorLength: number of SVM coefficients
// pTestType:     output, predicted label of each sample (1 or -1)
// pDataScore:    output, score and actual label of each sample
//
// The score of a sample is  pVector . dwFeature + NUM_B_THRESHOLD; a score
// greater than 0 is classified as 1, anything else as -1.
void ClassTestData(DATAINFO* pData, int nDataLength, double* pVector, int nVectorLength, int* pTestType, DATASCORE *pDataScore)
{
    assert(pData && pVector && pTestType && pDataScore);
    for (int i = 0; i < nDataLength; i++)
    {
        // The accumulator must start from zero for every sample; otherwise the
        // scores of all previous samples leak into the current one.
        double dwSum = 0;
        for (int j = 0; j < nVectorLength; j++)
        {
            dwSum += pVector[j] * pData[i].dwFeature[j];
        }
        dwSum += NUM_B_THRESHOLD;

        pTestType[i] = (dwSum > 0) ? 1 : -1;
        pDataScore[i].score = dwSum;
        pDataScore[i].nType = pData[i].nType;
    }
}
// Computes the classification statistics of the test set.
//
// pData:       test samples (actual labels are read from nType)
// nDataLength: number of test samples
// pTestType:   predicted label of each sample
// pResult:     output array of three values:
//                pResult[0]  true positive rate  = correctly predicted 1 / actual 1
//                pResult[1]  false positive rate = predicted 1 but actually not 1 / actual not 1
//                pResult[2]  overall accuracy    = correct predictions / all samples
//
// A sample counts as positive when its label equals 1. A rate whose
// denominator would be zero (no positives, no negatives, or no samples at
// all) is reported as 0 instead of NaN/inf.
void StatistClassResult(DATAINFO* pData, int nDataLength, int* pTestType, float *pResult)
{
    assert(pData && pTestType && pResult);

    int nTruePos = 0, nTrueNeg = 0, nFalsePos = 0, nActualPos = 0;
    for (int i = 0; i < nDataLength; i++)
    {
        const int bPredictedPos = (1 == pTestType[i]);
        if (pTestType[i] == pData[i].nType)
        {
            if (bPredictedPos)
            {
                nTruePos++;
            }
            else
            {
                nTrueNeg++;
            }
        }
        else if (bPredictedPos)
        {
            // Predicted 1 although the actual label is something else.
            nFalsePos++;
        }
        if (1 == pData[i].nType)
        {
            nActualPos++;
        }
    }

    const int nActualNeg = nDataLength - nActualPos;
    pResult[0] = (nActualPos > 0) ? (float)nTruePos / nActualPos : 0.0f;
    pResult[1] = (nActualNeg > 0) ? (float)nFalsePos / nActualNeg : 0.0f;
    pResult[2] = (nDataLength > 0) ? (float)(nTruePos + nTrueNeg) / nDataLength : 0.0f;
}
// Returns the larger of the two values; b is returned when a is not
// strictly greater than b.
float max(float a, float b)
{
    if (a > b)
    {
        return a;
    }
    return b;
}
// Returns the smaller of the two values; b is returned when a is not
// strictly less than b.
float min(float a, float b)
{
    if (a < b)
    {
        return a;
    }
    return b;
}
// Sorts the test-sample score records in ascending order of score (the
// ordering is needed for the AUC computation).
//
// pDataScore:  array of score records, reordered in place
// nDataLength: number of records in the array
//
// Works as an exchange sort: each position in turn is compared with every
// later record, and the two are swapped whenever the earlier score is larger.
void sort(DATASCORE *pDataScore, int nDataLength)
{
    assert(pDataScore);
    DATASCORE Held;
    int nSlot = 0;
    while (nSlot < nDataLength)
    {
        DATASCORE *pSlot = pDataScore + nSlot;
        for (int nLater = nSlot + 1; nLater < nDataLength; ++nLater)
        {
            DATASCORE *pLater = pDataScore + nLater;
            if (!(pSlot->score > pLater->score))
            {
                continue;   // already in order, nothing to exchange
            }
            Held = *pSlot;
            *pSlot = *pLater;
            *pLater = Held;
        }
        ++nSlot;
    }
}
//计算AUC得分
//pDataScore:测试集样本得分存储指针
//nDataLength:测试集样本总数
float GetAUC(DATASCOR