python爬取cnblogs页面信息
1.导入相应的类库
import requests as rs
import re
import xlsxwriter
from lxml import etree #导入lxml库的etree模块
2.提取url串
# 2. Build the list of page URLs to crawl: the front page plus pages 2-199.
# NOTE(review): the '#p<N>' part is a URL fragment, which is client-side only —
# the server will return the same front page for every entry; confirm the real
# paging URL (e.g. '/sitehome/p/<N>') against the site.
url = "https://www.cnblogs.com/"
str1 = "https://www.cnblogs.com/#p"
urllist = ["https://www.cnblogs.com/"]
for i in range(2, 200):
    urllist.append(str1 + str(i))
3.模拟浏览器
# 3. Browser disguise: a User-Agent header (a plain dict of key/value pairs)
# so cnblogs serves us the normal browser page instead of blocking the bot.
header = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"
}
# Accumulator for the raw HTML of every page fetched below.
htmlstr = ""
4.提取html
# 4. Download every page and concatenate the HTML into one string,
# then parse the combined markup into an lxml element tree.
pages = []
for url in urllist:
    # timeout prevents the crawl from hanging forever on a dead connection
    req = rs.get(url, headers=header, timeout=10)
    req.encoding = "utf-8"
    pages.append(req.text)
# join once instead of += per page (avoids quadratic string rebuilding)
htmlstr += "".join(pages)
html = etree.HTML(htmlstr)
5.提取相应内容
# 5. Extract the fields of interest from the combined HTML.

# Post titles: the text of every <h3> heading.
result1 = html.xpath('//h3//text()')

# Post summaries via regex (the pure-XPath approach mis-aligned titles and
# summaries by one), scraping the text that follows each avatar <img ... alt=""/>.
# NOTE(review): the trailing char is a literal ellipsis '…' — confirm it matches
# the page markup rather than three ASCII dots.
pattern1 = re.compile(r'alt=""/>(.*)…')
result2 = pattern1.findall(htmlstr)

# Authors (poster names in the item footer).
result3 = html.xpath('//div[@class="post_item_foot"]/a[@class="lightblue"]/text()')

# Comment counts, e.g. '评论(12)' -> '12'. The literal parens are escaped so
# findall returns plain strings; unescaped they would add a second capture
# group and findall would return tuples, breaking the string join later.
pattern2 = re.compile(r'评论\((.{1,3})\)')
result4 = pattern2.findall(htmlstr)

# View counts, e.g. '阅读(345)' -> '345' (same escaping rationale as above).
pattern3 = re.compile(r'阅读\((.{1,3})\)')
result5 = pattern3.findall(htmlstr)

# Publication timestamps: the 16 characters after '发布于 ',
# e.g. '2020-01-01 12:34'.
pattern4 = re.compile(r'发布于\s(.{16})')
result6 = pattern4.findall(htmlstr)

# Recommendation ("digg") counts.
result7 = html.xpath('//div[@class="diggit"]/span[@class="diggnum"]/text()')
##print(result7)
6.整合起来,放在新的列表中
# 6. Merge the per-post fields into one space-separated line per post.
# zip() stops at the shortest list, so mismatched result-list lengths
# (which the scraping above can produce) cannot raise IndexError the way
# indexing 0..len(result2) into all seven lists could.
resultlist = []
for fields in zip(result1, result2, result3, result4, result5, result6, result7):
    resultlist.append(" ".join(fields))
7.写入txt文件中
##f=open(r"博客.txt","w")
##for i in range(len(resultlist)):
##    f.write(resultlist[i]+"\n")
##f.close()