周日做了信息论的小project,差不多熬了一个晚上加周一的早上,终于利用matlab成功地读取了txt文件中的英文单词,并完成了简单的数据处理,现在进行简单的分享。
百度经验:matlab如何读取txt文件: https://jingyan.baidu.com/article/b87fe19e6b478852183568e1.html
代码:
- function [] = work3()
- %WORK3 Plot H/N against error for 1-, 2- and 3-grams of harry1.txt.
- %   Reads 'harry1.txt', replaces commas with spaces, lower-cases the text
- %   and splits it on whitespace. For n = 1, 2, 3 the words are grouped into
- %   non-overlapping runs of n consecutive words, the occurrences of every
- %   distinct group are counted, and the ascending counts are passed to errH.
- %   The three resulting curves are drawn with stairs in a single figure.
- %   Takes no inputs and returns nothing.
- clc
- % NOTE(review): the function workspace holds no variables at this point;
- % consider removing 'clear all'. Kept so side effects stay unchanged.
- clear all
- close all
- %% read data
- ch = fileread('harry1.txt');
- ch = strrep(ch,',',' '); % replace commas with spaces
- ch = lower(ch); % fold everything to lower case
- tokens = reshape(strsplit(ch),[],1); % cell column, one token per cell
- % strsplit returns '' for leading/trailing whitespace; those are not words
- tokens = tokens(~cellfun(@isempty,tokens));
- wordMat = char(tokens); % blank-padded char matrix, one word per row
- %% 1-, 2- and 3-gram curves
- [err1,H1]=errH(ngramCounts(wordMat,1));
- [err2,H2]=errH(ngramCounts(wordMat,2));
- [err3,H3]=errH(ngramCounts(wordMat,3));
- %% plot
- figure
- stairs(err1,H1/H1(1),'r')
- titleName = ['N=',num2str(H1(1))]; % title shows the first H value of the 1-gram curve
- hold on
- stairs(err2,H2/H2(1),'b')
- stairs(err3,H3/H3(1),'k')
- title(titleName,'fontsize',16,'fontweight','bold');
- xlabel('误差','fontsize',16,'fontweight','bold');
- ylabel('H/N','fontsize',16,'fontweight','bold');
- legend('1-gram','2-gram','3-gram');
- end
- %%
- %% n-gram counting helper
- function counts = ngramCounts(wordMat,n)
- %NGRAMCOUNTS Ascending occurrence counts of non-overlapping n-word groups.
- %   wordMat : char matrix, one blank-padded word per row.
- %   n       : number of consecutive rows joined into one group.
- %   counts  : row vector with one entry per distinct group, sorted ascending.
- %   Trailing rows that do not fill a complete group are dropped.
- nGroups = floor(size(wordMat,1)/n);
- if nGroups == 0
- counts = zeros(1,0); % fewer than n words: nothing to count
- return
- end
- width = size(wordMat,2);
- % Transposing puts each word in a column, so reshape stacks n consecutive
- % words into one column; transposing back gives one group per row.
- grouped = reshape(wordMat(1:nGroups*n,:).',width*n,nGroups).';
- [distinct,~,idx] = unique(grouped,'rows');
- % Tally each label directly; the output length is the number of distinct
- % rows, not length() of a char matrix.
- counts = sort(accumarray(idx(:),1,[size(distinct,1) 1]).');
- end
- %%
- %% error / log2-remaining-count curve
- function [err,H]=errH(numOccurrences)
- %ERRH Build the (err, H) curve from a vector of occurrence counts.
- %   numOccurrences : vector of counts, processed in the order given.
- %   err : row vector starting at 0; each step adds
- %         (numOccurrences(r)/sum(numOccurrences))/numOccurrences(r).
- %   H   : row vector, log2 of the number of occurrences still remaining;
- %         H(1) is log2(sum(numOccurrences)).
- %   One step is taken per occurrence, so both outputs have one more
- %   element than the number of steps. When the counts are whole numbers the
- %   remaining count reaches 0 on the last step, so H(end) is log2(0) = -Inf.
- x_remain=sum(numOccurrences);
- p_num=numOccurrences /sum(numOccurrences);
- % The inner loop runs max(floor(count),0) times per entry; size the outputs
- % once up front instead of growing them on every step.
- nSteps=sum(max(floor(numOccurrences(:)),0));
- err=zeros(1,nSteps+1);
- H=zeros(1,nSteps+1);
- H(1)=log2(x_remain);
- num=1;
- for r=1:length(numOccurrences)
- step=p_num(r)/numOccurrences(r); % same increment for every occurrence of entry r
- for n=1:numOccurrences(r)
- num=num+1;
- x_remain=x_remain-1;
- err(num)=err(num-1)+step;
- H(num)=log2(x_remain);
- end
- end
- end