1.关于.tsv文件
TSV(tab-separated values),即“制表符分隔值”:每行的各项数据之间使用制表符(Tab)进行分隔。如:
姓名 年龄 身高 ...
小红 10 166 ...
小绿 15 175 ...
小花 20 164 ...
......
C题中的数据也是这种格式
2.关于题目中给出的数据
题目中给出的数据好像是直接从网站上爬下来的(在某些数据中发现了 HTML 换行标签 <br /> 之类的字符)。有的数据行中含有一些会影响程序读入的特殊字符,由于这样的行数量极少,我直接手动剔除了,没有另外编写程序来实现无效数据的剔除。
3.根据给出的数据设计相应的类
FEADDATA.h
#pragma once
#include<string>
#include"DATE.h"
// One record (one line) of the tab-separated review data set.
//
// Every numeric member carries a default initializer so that a
// default-constructed FEADDATA can be copied safely (for example pushed into
// a container) before all of its fields have been filled in.
class FEADDATA
{
    // Because of time pressure all data members are simply public, so no
    // get/set functions are provided.
public:
    std::string marketplace;       // 2 letter country code of the marketplace where the review was written.
    std::string customer_id;       // Random identifier that can be used to aggregate reviews
                                   // written by a single author.
    std::string review_id;         // The unique ID of the review.
    std::string product_id;        // The unique Product ID the review pertains to.
    std::string product_parent;    // Random identifier that can be used to aggregate reviews for
                                   // the same product.
    std::string product_title;     // Title of the product.
    std::string product_category;  // The major consumer category for the product.
    int star_rating = 0;           // The 1-5 star rating of the review.
    int helpful_votes = 0;         // Number of helpful votes.
    int total_votes = 0;           // Number of total votes the review received.
    std::string vine;              // Raw "vine" column of the data file.
    std::string verified_purchase; // A "Y" indicates Amazon verified that the person writing the
                                   // review purchased the product at Amazon and didn't receive the
                                   // product at a deep discount.
    std::string review_headline;   // The title of the review.
    std::string review_body;       // The review text.
    int LengthofReview_body = 0;   // Length of the review text.

    // The review date is kept in three forms: a plain int, the custom DATE
    // type, and the original string exactly as it appears in the data file.
    int review_date = 0;           // The date the review was written, as an int.
    DATE X_review_date;            // The date the review was written, as year/month/day.
    std::string review_date_ori;   // The date the review was written, original text.

public:
    // Part two: members used by the problem-specific analysis. They are not
    // related to reading the data, and their definitions are not part of this
    // header.
    int V_BFlag = 0;
    int LevelOfHelpfv = 0;
    int LevelOfRevb = 0;
    double MoreReliableStarRating = 0.0;
    void DoV_BFlag();
    void DoLevelOfHelpfv();
    void DoLevelOfRevb();
    void DoMoreReliableStarRating(int kindofgoods);
};
//
DATE 类型的数据格式
DATE.h
#pragma once
// Plain calendar date split into its numeric components.
//
// All members default to 0 so that a default-constructed DATE never holds
// indeterminate values (it is embedded in records that get copied around
// before the date has been parsed).
class DATE
{
public:
    int year = 0;   // Calendar year.
    int month = 0;  // Month number.
    int day = 0;    // Day of the month.
};
读取数据
dispose.cpp
#include"FEADDATA.h"
#include<iostream>
#include<list> //链表
#include<fstream>
#include<cstdlib>
#include <iomanip>
using namespace std;
int main()
{
ifstream infill; //定义文件输入流对象
string FilePath =
"F:\\Desktop_F\\2020\\2020_Weekend2_Problems\\Problem_C_Data\\pacifier.tsv";//数据文件的路径
infill.open(FilePath);
string buff; //read a single line first
if (infill.fail()) //判断文件文件是否打开成功
{
cerr << "open file error!" << endl;
return -1;
}
int eachlen[15];
string P = " "; //set the character("Tab") to look for
int begin = -1; // location of P
int count = 0; //set a counter
int loca[14] = { 0 }; //用来存放每行数据的“Tab”的位置,题目中给出的数据每行有15个数据,因此定义一个大小为14的数组
list<FEADDATA> DataList; //定义链表,用来存储每一行数据
FEADDATA tmp_FEADDATA;
string change;
while (getline(infill, buff)) //while循环进行逐行读入
{
begin = -1;
count = 0;
while ((begin = buff.find(P, begin + 1)) != string::npos) //计算每行数据的“Tab”的位置并存入数组
{
loca[count] = begin;
count++;
begin = begin + P.length();
}
eachlen[0] = loca[0];
for (int i = 1; i != 14; i++) //计算一条数据中每项数据的字符个数,并存入eachlen数组
{
eachlen[i] = loca[i] - loca[i - 1] - 1;
}
eachlen[14] = buff.length()-loca[13];
//根据每一项数据的起始位置和数据长度读取数据
tmp_FEADDATA.marketplace = buff.substr(0, eachlen[0]);
tmp_FEADDATA.customer_id = buff.substr(loca[0]+1, eachlen[1]);
tmp_FEADDATA.review_id = buff.substr(loca[1] + 1, eachlen[2]);
tmp_FEADDATA.product_id = buff.substr(loca[2] + 1, eachlen[3]);
tmp_FEADDATA.product_parent = buff.substr(loca[3] + 1, eachlen[4]);
tmp_FEADDATA.product_title = buff.substr(loca[4] + 1, eachlen[5]);
tmp_FEADDATA.product_category = buff.substr(loca[5] + 1, eachlen[6]);
change=buff.substr(loca[6] + 1, eachlen[7]); //将string类型转为int类型
tmp_FEADDATA.star_rating = atoi(change.c_str());
change=buff.substr(loca[7] + 1, eachlen[8]);
tmp_FEADDATA.helpful_votes = atoi(change.c_str());
change=buff.substr(loca[8] + 1, eachlen[9]);
tmp_FEADDATA.total_votes = atoi(change.c_str());
tmp_FEADDATA.vine = buff.substr(loca[9] + 1, eachlen[10]);
tmp_FEADDATA.verified_purchase = buff.substr(loca[10] + 1, eachlen[11]);
tmp_FEADDATA.review_headline = buff.substr(loca[11] + 1, eachlen[12]);
tmp_FEADDATA.review_body = buff.substr(loca[12] + 1, eachlen[13]);
tmp_FEADDATA.review_date_ori = buff.substr(loca[13] + 1, eachlen[14]);
DataList.push_back(tmp_FEADDATA); //将 tmp_FEADDATA中的数据向链表中尾插
}
//while循环结束,数据基本读取完毕
//下面是对时间数据类型转换的操作
string strdate;
string stryear, strmonth, strday;
int firstline;
for (auto& it : DataList)
{
strdate = it.review_date_ori;
if ((firstline = strdate.find('/')) == string::npos)
{
cerr << "string of time error!" << endl;
return -1;
}
stryear = strdate.substr(strdate.length() - 4, 4);
it.X_review_date.year = atoi(stryear.c_str());
strmonth = strdate.substr(0, firstline);
it.X_review_date.month = atoi(strmonth.c_str());
if (strmonth.length() != 2)
{
strmonth = '0' + strmonth;
}
strday = strdate.substr(firstline + 1, strdate.length()-6 -firstline);
it.X_review_date.day = atoi(strday.c_str());
if (strday.length() != 2)
{
strday = '0' + strday;
}
//连接
strdate = stryear + strmonth;
strdate = strdate + strday;
it.review_date = atoi(strdate.c_str());
}
infill.close();
return 0;
}
结束
本文仅作为学习交流使用,欢迎指正!!!
祝大家比赛都能取得好成绩