)\d*\.?\d*?(?=)';
HousePrice=string(regexp(source,HousePrice,'match')');
HousePrice(haha)=[];
Average(haha)=[];
同样用正则表达式提取出房价等信息,最后将数据整理好,准备写入 Excel 文件:
% Append the columns extracted from the current page to the running results.
SS=vertcat(SS,Site);
HT=vertcat(HT,HouseType);
HA=vertcat(HA,HouseArea);
De=vertcat(De,Decoration);
HP=vertcat(HP,HousePrice);
Av=vertcat(Av,Average);
DB=vertcat(DB,Describe);
% Stack the header row on top of the six result columns and write the sheet.
% (DB is accumulated above but is not part of the exported table.)
headerRow=[string('地址') string('房型') string('大小') string('装修') string('总价/万') string('单价/万')];
Data=vertcat(headerRow,horzcat(SS,HT,HA,De,HP,Av));
xlswrite('Data.xls',Data);
4. 下面是完整代码
% Crawler script: for every combination of the price / layout / area filters
% (values 1..6 each) it requests the gz.lianjia.com/ershoufang listing pages,
% extracts one row per listing with regular expressions, accumulates the rows
% and finally writes the table to Data.xls.
Type='l';  % URL key of the layout (house type) filter
Area='a';  % URL key of the floor-area filter
Price='p'; % URL key of the price filter
% The URL segments are ordered pg, l, a, p and a slash is needed after them.
% NOTE(review): the URLs built below do not append that slash - confirm.
SS=[];HT=[];HA=[];HP=[];De=[];Av=[];DB=[]; % accumulators for the extracted columns
for p=1:6 % was a bare "p=1:6": the matching "end" existed but the "for" keyword was missing
    for l=1:6
        for a=1:6
            % Fetch page 1 of this filter combination only to learn the page count.
            Website=['https://gz.lianjia.com/ershoufang/' 'pg1' Type num2str(l) Area num2str(a) Price num2str(p)];
            [source, state]=urlread(Website);
            if ~state
                continue;
            end
            GetTotalPage='(?<="page-data=''{"totalPage":).*?(?=,"curPage)';
            TotalPage=str2double(regexp(source,GetTotalPage,'match'));
            % No match (or a non-numeric match) means there is nothing to page through.
            if isempty(TotalPage) || isnan(TotalPage(1))
                continue;
            end
            TotalPage=TotalPage(1);
            for pg=1:TotalPage
                Website=['https://gz.lianjia.com/ershoufang/' 'pg' num2str(pg) Type num2str(l) Area num2str(a) Price num2str(p)];
                [source, state]=urlread(Website);
                if ~state
                    fprintf('%s l=%d a=%d p=%d pg=%d\n','爬取网页出错,当前序号:',l,a,p,pg);
                    % Skip this page: source is empty, so parsing it is pointless.
                    continue;
                end
                % NOTE(review): the HTML tags that delimited the pattern below appear to
                % have been stripped when the article text was extracted (the literal is
                % split across lines); restore them from the page source before running.
                Site=string(regexp(source,'(?<=
).*?(?=
)','match')');
                nListings=numel(Site); % listing count before any filtering
                % aa holds the per-listing fields obtained by splitting on '/'.
                % haha holds the indices of listings that do not have exactly 5 fields;
                % the same indices are removed from every parallel list further down.
                try
                    aa=regexp(Site,'/','split');
                    haha=cell2mat(cellfun(@size ,aa,'UniformOutput',false));
                    haha=find(haha(:,2)~=5);
                    aa(haha)=[];
                    aa=reshape([aa{:}],5,length(aa))';
                catch
                    continue;
                end
                % NOTE(review): the closing delimiter of the pattern below was also lost
                % in extraction - restore it from the page source.
                Describe=string(regexp(source,'(?<=" data-is_focus="" data-el="ershoufang">).*?(?=
)','match')');
                Describe=(regexp(Describe,'(?<=data-is_focus="" data-el="ershoufang">).*?','split'));
                Describe=reshape([Describe{:}],2,length([Describe{:}])/2)';
                Describe=Describe(:,2);
                Describe(haha)=[]; % listing title / short description
                Site=aa(:,1);
                HouseType=string(aa(:,2));
                HouseArea=string(aa(:,3));
                Decoration=string(aa(:,5));
                clear aa;
                Average='(?<=data-price=")\d*\.?\d*?(?=">)';
                Average=string(regexp(source,Average,'match')');
                % NOTE(review): delimiters of the total-price pattern were lost in
                % extraction as well - restore them from the page source.
                HousePrice='(?<=
)\d*\.?\d*?(?=)';
                HousePrice=string(regexp(source,HousePrice,'match')');
                % The deletions below index the price lists with positions taken from
                % the listing list, so the lists must line up one-to-one. Skip the page
                % otherwise instead of failing here or at the final concatenation.
                if numel(HousePrice)~=nListings || numel(Average)~=nListings
                    warning('Skipping page %d (l=%d a=%d p=%d): listing and price counts differ.',pg,l,a,p);
                    continue;
                end
                HousePrice(haha)=[];
                Average(haha)=[];
                SS=[SS ;Site];
                HT=[HT ;HouseType];
                HA=[HA; HouseArea];
                De=[De; Decoration];
                HP=[HP; HousePrice] ;
                Av=[ Av ;Average];
                DB=[DB ;Describe];
            end
        end
    end
end
% Header row followed by the six collected columns (DB is collected but not exported).
Data=[ string('地址') string('房型') string('大小') string('装修') string('总价/万') string('单价/万');SS HT HA De HP Av;];
xlswrite('Data.xls',Data);
结果如下:
如果需要得到更美观的 Excel 表格,请参阅“MATLAB 对 Excel 进行控制”一文。