2020_MCM_Problem_C 数据的读取及相关操作

最新推荐文章于 2024-04-27 16:22:04 发布

Hello World : )

最新推荐文章于 2024-04-27 16:22:04 发布

阅读量951

点赞数 2

分类专栏：笔记文章标签： c++ 美国大学生数学建模竞赛

本文链接：https://blog.csdn.net/qq_44648883/article/details/104814511

版权

笔记专栏收录该内容

4 篇文章 0 订阅

订阅专栏

1.关于.tsv文件

TSV：Tab Separated Values，即“制表符分隔值”：每行中的各项数据之间使用制表符（Tab）进行分隔。如：

姓名	年龄	身高	...
小红	10	166	...
小绿	15	175	...
小花	20	164	...
......

C题中的数据也是这种格式

2.关于题目中给出的数据

题目中给出的数据好像是直接从网站上爬下来的（在某些数据中发现了 HTML 换行符之类的字符）。有的数据行中含有一些会影响程序读入的特殊字符，由于数量极少，我直接手动剔除了这些行，没有编写程序来自动剔除无效数据。

3.根据给出的数据设计相应的类

FEADDATA.h

#pragma once
#include<string>
#include"DATE.h"
// One review record read from a Problem C data file, plus derived values
// used by later analysis steps.
//
// All members are public on purpose (the author skipped getters/setters to
// save time). Every numeric member has a default initializer so that a
// default-constructed record never carries indeterminate values.
class FEADDATA
{
public:
    // ---- Part 1: columns read from the data file ----
    std::string marketplace;       // 2 letter country code of the marketplace where the review was written.
    std::string customer_id;       // Random identifier that can be used to aggregate reviews
                                   // written by a single author.
    std::string review_id;         // The unique ID of the review.
    std::string product_id;        // The unique Product ID the review pertains to.
    std::string product_parent;    // Random identifier that can be used to aggregate reviews for
                                   // the same product.
    std::string product_title;     // Title of the product.
    std::string product_category;  // The major consumer category for the product.
    int star_rating = 0;           // The 1-5 star rating of the review.
    int helpful_votes = 0;         // Number of helpful votes.
    int total_votes = 0;           // Number of total votes the review received.
    std::string vine;              // "vine" column, stored exactly as it appears in the file.
    std::string verified_purchase; // A "Y" indicates Amazon verified that the person writing the
                                   // review purchased the product at Amazon and didn't receive the
                                   // product at a deep discount.
    std::string review_headline;   // The title of the review.
    std::string review_body;       // The review text.
    int LengthofReview_body = 0;   // Length of the review text.

    // The review date is kept in three representations: an int, a DATE
    // (year/month/day) and the original string as read from the file.
    int review_date = 0;           // The date the review was written, as an int.
    DATE X_review_date{};          // The date the review was written, as year/month/day.
    std::string review_date_ori;   // The date the review was written, original text from the file.

public:
    // ---- Part 2: values and functions used while solving the problem ----
    // These are unrelated to reading the data; the functions are defined elsewhere.
    int V_BFlag = 0;
    int LevelOfHelpfv = 0;
    int LevelOfRevb = 0;
    double MoreReliableStarRating = 0.0;
    void DoV_BFlag();
    void DoLevelOfHelpfv();
    void DoLevelOfRevb();
    void DoMoreReliableStarRating(int kindofgoods);
};
//

DATE 类型的数据格式
DATE.h

#pragma once
// Plain calendar date split into its numeric components.
// Members default to 0 so a default-constructed DATE is never indeterminate.
class DATE
{
public:
    int year = 0;
    int month = 0;
    int day = 0;
};

读取数据

dispose.cpp

#include"FEADDATA.h"
#include<iostream>
#include<list> //链表
#include<fstream>   
#include<cstdlib>
#include <iomanip>
using namespace std;

int main()
{
 ifstream infill;        //定义文件输入流对象
 string FilePath =
  "F:\\Desktop_F\\2020\\2020_Weekend2_Problems\\Problem_C_Data\\pacifier.tsv";//数据文件的路径
 infill.open(FilePath);
 string buff;           //read  a single line first
 if (infill.fail())     //判断文件文件是否打开成功
 {
  cerr << "open file error！" << endl;
  return -1;
 }
 int eachlen[15];
 string P = " ";        //set the character("Tab") to look for  
 int begin = -1;        // location of P
 int count = 0;         //set a counter
 int loca[14] = { 0 };  //用来存放每行数据的“Tab”的位置，题目中给出的数据每行有15个数据，因此定义一个大小为14的数组
 
list<FEADDATA> DataList;  //定义链表，用来存储每一行数据
 FEADDATA tmp_FEADDATA;    
 string change;
while (getline(infill, buff))    //while循环进行逐行读入
 { 
  begin = -1;  
  count = 0;
  while ((begin = buff.find(P, begin + 1)) != string::npos)  //计算每行数据的“Tab”的位置并存入数组
  {
   loca[count] = begin;
   count++;
   begin = begin + P.length();
  }
  eachlen[0] = loca[0];
  for (int i = 1; i != 14; i++)  //计算一条数据中每项数据的字符个数，并存入eachlen数组           
  {
   eachlen[i] = loca[i] - loca[i - 1] - 1;
  }
  eachlen[14] = buff.length()-loca[13];
  //根据每一项数据的起始位置和数据长度读取数据
  tmp_FEADDATA.marketplace = buff.substr(0, eachlen[0]);
  tmp_FEADDATA.customer_id = buff.substr(loca[0]+1, eachlen[1]);
  tmp_FEADDATA.review_id = buff.substr(loca[1] + 1, eachlen[2]);
  tmp_FEADDATA.product_id = buff.substr(loca[2] + 1, eachlen[3]);
  tmp_FEADDATA.product_parent = buff.substr(loca[3] + 1, eachlen[4]);
  tmp_FEADDATA.product_title = buff.substr(loca[4] + 1, eachlen[5]);
  tmp_FEADDATA.product_category = buff.substr(loca[5] + 1, eachlen[6]);
     change=buff.substr(loca[6] + 1, eachlen[7]);        //将string类型转为int类型
  tmp_FEADDATA.star_rating = atoi(change.c_str());
  change=buff.substr(loca[7] + 1, eachlen[8]);
  tmp_FEADDATA.helpful_votes = atoi(change.c_str());
  change=buff.substr(loca[8] + 1, eachlen[9]);
  tmp_FEADDATA.total_votes = atoi(change.c_str());
  tmp_FEADDATA.vine = buff.substr(loca[9] + 1, eachlen[10]);
  tmp_FEADDATA.verified_purchase = buff.substr(loca[10] + 1, eachlen[11]);
  tmp_FEADDATA.review_headline = buff.substr(loca[11] + 1, eachlen[12]);
  tmp_FEADDATA.review_body = buff.substr(loca[12] + 1, eachlen[13]);
  tmp_FEADDATA.review_date_ori = buff.substr(loca[13] + 1, eachlen[14]);
  DataList.push_back(tmp_FEADDATA);    //将 tmp_FEADDATA中的数据向链表中尾插
 }
 //while循环结束，数据基本读取完毕
 //下面是对时间数据类型转换的操作
 string strdate;
 string stryear, strmonth, strday;
 int firstline;
 for (auto& it : DataList)
 {
  strdate = it.review_date_ori;
  if ((firstline = strdate.find('/')) == string::npos)
  {
   cerr << "string of time error!" << endl;
   return -1;
  }
  stryear = strdate.substr(strdate.length() - 4, 4);
  it.X_review_date.year = atoi(stryear.c_str());
  strmonth = strdate.substr(0, firstline);
  it.X_review_date.month = atoi(strmonth.c_str());
  if (strmonth.length() != 2)
  {
   strmonth = '0' + strmonth;
  }
  strday = strdate.substr(firstline + 1, strdate.length()-6 -firstline);
  it.X_review_date.day = atoi(strday.c_str());
  if (strday.length() != 2)
  {
   strday = '0' + strday;
  }
  //连接
  strdate = stryear + strmonth;
  strdate = strdate + strday;
  it.review_date = atoi(strdate.c_str()); 
 }
 infill.close();   
return 0;
}