代码在CentOS 7上运行通过,Windows下无法运行,统计规则如下:
- 形如to-night用一个连字符连起来的单词记为一个单词
- 在行尾用连字符连起来的单词算一个单词,例如:
xxxxxxxxxxxxxxxxxxx beauti-
ful
beautiful算一个单词
- 多个连字符相连的视为多个单词,例如it---is---a---dog,视为it is a dog共计为4个单词
- 文件路径名不可超过1023个字符(缓冲区共1024字节,其中1字节用于结尾的'\0')
- 使用-h参数查看使用帮助
下面是代码:
#include <ctype.h>
#include <unistd.h>
#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <map>
#include <string>
#include <vector>
// Word -> number of occurrences; filled by process_data() and by main().
std::map<const std::string, int> data;
// (word, count) pairs copied out of `data` in main() so they can be sorted.
std::vector<std::pair<std::string, int> > outputlist;
// Size in bytes of the `path` buffer.
#define PATH_LEN 1024
// Size in bytes of one read chunk; also used to size the carry buffer.
#define BUFF_SIZE 1024
// Input file path, copied from the -s option (hard-coded in _DEBUG builds).
char path[PATH_LEN];
// Value of the -m option; when non-zero main() redirects stdout to output.txt.
int output_mode = 0;
/**
 * Prints the command-line usage text to stdout.
 *
 * The -m description matches what main() does: a non-zero mode redirects
 * the report to output.txt, zero keeps it on the console. The length limit
 * is one less than the 1024-byte path buffer because of the trailing NUL.
 */
static void show_help_message() {
    printf(
        "Usage:\n-s 文件名,\n-m 输出格式,0表示命令行 非0表示文件(output.txt)\n"
        "-h 显示帮助文档\n\n文件名不能超过1023个字符\n");
}
/**
 * Parses the command line with getopt() and stores the result in the
 * globals `path` (-s) and `output_mode` (-m); -h prints the help text.
 *
 * @param argc pointer to main()'s argument count
 * @param argv pointer to main()'s argument vector
 * @return 0 on success; -1 on an unknown option, a file name that does not
 *         fit into `path`, or when no file name was supplied
 */
static int init_arguments(int* argc, char*** argv) {
    const char* optstring = "s:m:h";
    int opt;
    while ((opt = getopt(*argc, *argv, optstring)) != -1) {
        switch (opt) {
            case 's':
                // Reject names that would overflow the fixed-size buffer
                // (the terminating NUL needs one byte as well).
                if (strlen(optarg) >= sizeof(path)) {
                    fprintf(stderr, "file name is too long!\n");
                    return -1;
                }
                strcpy(path, optarg);
                break;
            case 'm':
                output_mode = atoi(optarg);
                break;
            case 'h':
                show_help_message();
                break;
            default:
                fprintf(stderr, "Please use -h to get help infomation\n");
                return -1;
        }
    }
    // `path` is an array, so only emptiness needs checking.
    if (path[0] == '\0') {
        fprintf(stderr, "file name is not given!\n");
        return -1;
    }
    return 0;
}
// Carry buffer: the word currently being assembled (already lower-cased,
// inner hyphens included). After the last chunk, main() reads its first
// pre_segment_len bytes as the final, still unfinished word.
char segment[BUFF_SIZE * 2];
// Number of valid bytes in `segment`.
int pre_segment_len = 0;

// Hyphen state of the word in `segment`; kept between calls so that a word
// split across two chunks is handled exactly like one inside a chunk.
enum {
    HYPHEN_NONE = 0,     // last character of the word was a letter
    HYPHEN_SEEN = 1,     // one '-' directly after a letter
    HYPHEN_WRAPPED = 2   // '-' followed by one or more line-break characters
};
static int pending_hyphen = HYPHEN_NONE;

// Appends one character to the word in progress; characters beyond the
// buffer capacity are dropped (one byte is kept free for a terminating NUL).
static void append_to_word(char ch) {
    if (pre_segment_len < static_cast<int>(sizeof(segment)) - 1) {
        segment[pre_segment_len++] = ch;
    }
}

// Counts the word in progress (if any) and resets the carry state.
static void finish_word() {
    if (pre_segment_len > 0) {
        ++data[std::string(segment, pre_segment_len)];
        pre_segment_len = 0;
    }
    pending_hyphen = HYPHEN_NONE;
}

/**
 * Feeds one chunk of text into the word counter.
 *
 * Rules:
 *  - a word is a run of letters, compared case-insensitively;
 *  - a single '-' between two letters stays part of the word ("to-night");
 *  - a '-' followed by line break(s) and then a letter joins both halves
 *    without the hyphen ("beauti-\nful" -> "beautiful");
 *  - two or more consecutive hyphens, or any other character, end the word.
 *
 * Completed words are counted in `data`. A word that is still open when the
 * chunk ends stays in `segment` / `pre_segment_len` and is continued by the
 * next call.
 *
 * @param words chunk of input bytes (need not be NUL-terminated)
 * @param len   number of bytes in `words`; values <= 0 are a no-op
 */
static void process_data(const char* words, const int len) {
    for (int i = 0; i < len; ++i) {
        // <ctype.h> functions require a value representable as unsigned char.
        const unsigned char ch = static_cast<unsigned char>(words[i]);
        if (isalpha(ch)) {
            if (pending_hyphen == HYPHEN_SEEN) {
                append_to_word('-');  // in-line hyphen is kept
            }
            // After a wrapped hyphen the two halves are joined directly.
            pending_hyphen = HYPHEN_NONE;
            append_to_word(static_cast<char>(tolower(ch)));
        } else if (ch == '-') {
            if (pre_segment_len > 0 && pending_hyphen == HYPHEN_NONE) {
                pending_hyphen = HYPHEN_SEEN;
            } else {
                finish_word();  // repeated or leading hyphen separates words
            }
        } else if (ch == '\r' || ch == '\n') {
            if (pending_hyphen != HYPHEN_NONE) {
                pending_hyphen = HYPHEN_WRAPPED;
            } else {
                finish_word();
            }
        } else {
            finish_word();
        }
    }
}
/**
 * Reads the file named by `path` in BUFF_SIZE-byte chunks and passes every
 * non-empty chunk to process_data().
 *
 * A word that is still unfinished when the file ends is not counted here;
 * it is left in the carry buffer for main() to count.
 *
 * @return 0 on success, -1 if the file cannot be opened or a read fails
 */
static int calculate() {
    FILE* fp = fopen(path, "r");
    if (fp == NULL) {
        fprintf(stderr, "文件名无效\n");
        return -1;
    }
    char buff[BUFF_SIZE];
    int status = 0;
    for (;;) {
        const size_t got = fread(buff, 1, BUFF_SIZE, fp);
        if (got > 0) {
            process_data(buff, static_cast<int>(got));
        }
        if (got < BUFF_SIZE) {
            // A short read means end of file or an error; stop either way
            // so a failing stream cannot spin forever.
            if (ferror(fp)) {
                fprintf(stderr, "failed to read file\n");
                status = -1;
            }
            break;
        }
    }
    fclose(fp);
    return status;
}
/**
 * Ordering used to sort the report: higher counts first, words with equal
 * counts in ascending alphabetical order so the output is deterministic.
 *
 * The parameters use the element type of the sorted vector so no temporary
 * pairs (and string copies) are created per comparison.
 *
 * @return true when `a` must be placed before `b`
 */
bool cmp(const std::pair<std::string, int>& a,
         const std::pair<std::string, int>& b) {
    if (a.second != b.second) {
        return a.second > b.second;
    }
    return a.first < b.first;
}
/**
 * Entry point: reads the options, counts the words of the input file and
 * prints "word:count" lines ordered by cmp(), to stdout or, when
 * output_mode is non-zero, to output.txt.
 *
 * @return 0 on success, -1 on bad arguments, an unreadable input file or an
 *         output file that cannot be opened
 */
int main(int argc, char** argv) {
// _DEBUG builds skip option parsing and use a fixed input file.
#ifndef _DEBUG
    if (init_arguments(&argc, &argv) == -1)
        return -1;
    printf("path:%s\nmode:%d\n", path, output_mode);
#else
    strcpy(path, "/home/chuancey/homework/doc/100M.txt");
    output_mode = 1;
    puts("DEBUG!\n");
#endif
    if (calculate() == -1)
        return -1;

    // The carry buffer may still hold the last word of the file.
    if (pre_segment_len > 0) {
        // Build the key from an explicit length instead of writing a NUL
        // into the buffer at an unchecked index.
        std::string tail(segment, pre_segment_len);
        for (size_t i = 0; i < tail.size(); ++i) {
            tail[i] = static_cast<char>(
                tolower(static_cast<unsigned char>(tail[i])));  // ignore case
        }
        // A dangling hyphen at end of input is not part of the word.
        while (!tail.empty() && tail[tail.size() - 1] == '-') {
            tail.erase(tail.size() - 1);
        }
        if (!tail.empty()) {
            ++data[tail];
        }
    }

    if (output_mode) {
        if (freopen("output.txt", "w", stdout) == NULL) {
            fprintf(stderr, "cannot open output.txt for writing\n");
            return -1;
        }
    }

    // NOTE(review): "wods" looks like a typo for "words"; the text is kept
    // so existing consumers of the output are not affected.
    printf("wods number: %lu\n", static_cast<unsigned long>(data.size()));

    outputlist.assign(data.begin(), data.end());
    std::sort(outputlist.begin(), outputlist.end(), cmp);
    for (size_t i = 0; i < outputlist.size(); ++i) {
        printf("%s:%d\n", outputlist[i].first.c_str(), outputlist[i].second);
    }
    return 0;
}