正态分布3σ原则:把落在均值±3倍标准差(3σ)范围之外的点视为噪声数据并予以排除。
归一化:对数据进行处理,使其限定在一定的范围内,通常限定到[0,1];这里采用最小-最大归一化:$x' = (x - x_{\min}) / (x_{\max} - x_{\min})$。
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <fstream>
#include <iostream>
#include <numeric>
#include <sstream>
#include <string>
#include <vector>
/**
 * Reads whitespace-separated values from a text file into a 2-D vector,
 * producing one inner vector per line of the file.
 *
 * Any previous content of @p lines_feat is discarded first. Reading stops at
 * the end of the file, at the first empty line, or as soon as a line can no
 * longer be read. Tokens on a line are parsed as DataType; parsing of a line
 * stops at the first token that does not convert.
 *
 * If the file cannot be opened, @p lines_feat is simply left empty.
 *
 * @param filename    path of the file to read
 * @param lines_feat  output: the parsed rows, in file order
 */
template <class DataType>
void ReadDataFromFile(std::string &filename, std::vector<std::vector<DataType> > &lines_feat)
{
    lines_feat.clear();

    std::ifstream vm_info(filename.c_str());
    std::string line;

    // Test the result of getline itself rather than eof(): when a read fails
    // without reaching end-of-file, the line buffer keeps its old content, and
    // an eof()-driven loop would append that same row over and over.
    while (std::getline(vm_info, line)) {
        // An empty line terminates the data section.
        if (line.empty()) {
            break;
        }

        std::istringstream fields(line);
        std::vector<DataType> row;
        DataType value;
        while (fields >> value) {
            row.push_back(value);
        }
        lines_feat.push_back(row);
    }
}
/**
 * Writes a 2-D vector to std::cout.
 *
 * Output layout: a header line with the number of rows, then one line per
 * row in which every element is followed by a single space, then a footer
 * line marking the end of the dump.
 *
 * @param vv  the rows to print; not modified
 */
template <class DataType>
void Display2DVector(std::vector<std::vector<DataType> > &vv)
{
    const size_t row_count = vv.size();
    std::cout << "the total rows of 2d vector_data: " << row_count << "\n";

    for (size_t r = 0; r < row_count; ++r) {
        const std::vector<DataType> &row = vv[r];
        for (size_t c = 0; c < row.size(); ++c) {
            std::cout << row[c] << " ";
        }
        std::cout << "\n";
    }

    std::cout << "--------the end of the Display2DVector()--------\n";
}
/**
 * Applies the 3-sigma rule and min-max normalization to the first three
 * columns of a 2-D data set and prints the results to std::cout.
 *
 * For each of the three columns the function computes the mean and the
 * population standard deviation, keeps only the values that lie within
 * [mean - 3*sigma, mean + 3*sigma], and then rescales the kept values to
 * [0, 1] using the minimum and maximum of the kept values.
 *
 * Output: three lines with the kept raw values of column 0, 1 and 2 (each
 * value followed by a space), then three lines with the corresponding
 * normalized values in the same layout.
 *
 * Edge cases:
 *  - Rows with fewer than three values are ignored.
 *  - If no usable row remains, nothing is printed.
 *  - The 3-sigma bounds are inclusive, so a column whose values are all equal
 *    (sigma == 0) keeps every value instead of rejecting all of them.
 *  - If the kept values of a column are all equal, they normalize to 0.
 *
 * @param vv  input rows; not modified
 */
template <class DataType>
void ProcessVector(std::vector<std::vector<DataType> > &vv)
{
    const size_t kColumns = 3;

    // Regroup the usable rows column by column so every later pass works on
    // data that is known to exist.
    std::vector<DataType> columns[kColumns];
    for (size_t i = 0; i < vv.size(); ++i) {
        if (vv[i].size() < kColumns) {
            continue;
        }
        for (size_t j = 0; j < kColumns; ++j) {
            columns[j].push_back(vv[i][j]);
        }
    }
    if (columns[0].empty()) {
        return;
    }
    const double count = static_cast<double>(columns[0].size());

    // Mean and population standard deviation per column. The sums are kept in
    // double so fractional inputs are not truncated and large integer inputs
    // do not overflow an int accumulator.
    double mean[kColumns] = {0.0};
    double sigma[kColumns] = {0.0};
    for (size_t j = 0; j < kColumns; ++j) {
        double total = 0.0;
        for (size_t i = 0; i < columns[j].size(); ++i) {
            total += static_cast<double>(columns[j][i]);
        }
        mean[j] = total / count;

        double squared_deviation = 0.0;
        for (size_t i = 0; i < columns[j].size(); ++i) {
            const double deviation = static_cast<double>(columns[j][i]) - mean[j];
            squared_deviation += deviation * deviation;
        }
        sigma[j] = std::sqrt(squared_deviation / count);
    }

    // First pass: drop the outliers, print what is kept and remember the
    // range of the kept values.
    std::vector<DataType> kept[kColumns];
    double min_value[kColumns] = {0.0};
    double max_value[kColumns] = {0.0};
    for (size_t j = 0; j < kColumns; ++j) {
        const double lower = mean[j] - 3 * sigma[j];
        const double upper = mean[j] + 3 * sigma[j];
        for (size_t i = 0; i < columns[j].size(); ++i) {
            const DataType value = columns[j][i];
            if (value >= lower && value <= upper) {
                std::cout << value << " ";
                kept[j].push_back(value);
            }
        }
        std::cout << "\n";

        // max_element/min_element return end() for an empty range, which must
        // not be dereferenced.
        if (!kept[j].empty()) {
            max_value[j] = *std::max_element(kept[j].begin(), kept[j].end());
            min_value[j] = *std::min_element(kept[j].begin(), kept[j].end());
        }
    }

    // Second pass: min-max normalization of the kept values.
    for (size_t j = 0; j < kColumns; ++j) {
        const double range = max_value[j] - min_value[j];
        for (size_t i = 0; i < kept[j].size(); ++i) {
            // A zero range means all kept values are equal; report 0 rather
            // than dividing by zero.
            const double normalized =
                (range > 0.0) ? (kept[j][i] - min_value[j]) / range : 0.0;
            std::cout << normalized << " ";
        }
        std::cout << "\n";
    }
}
/**
 * Program entry point: loads "vm.data" as rows of integers, prints the raw
 * table, then prints the outlier-filtered and normalized columns.
 */
int main()
{
    std::string input_path("vm.data");
    std::vector<std::vector<int> > table;

    // Load the file into a 2-D vector, one inner vector per line.
    ReadDataFromFile(input_path, table);

    // Show the data exactly as it was read.
    Display2DVector(table);

    // Filter outliers and normalize, printing the results.
    ProcessVector(table);

    std::cout << "--------The end of main()--------\n";
    return 0;
}
源数据如下(cat vm.data):
19 26 63
13 62 65
16 69 15
14 56 17
19 6 15
11 42 15
18 58 36
12 77 33
10 75 47
15 54 70
10017 1421077 4196
|
请发表评论