刚开始学习爬虫，下面利用 requests 库抓取数据，
以爬取 QQ 情侣签名为例。
import time

import requests
from lxml.html import etree
# Crawl pages 1..499 of the couples'-signature listing, extract each
# signature's text via XPath, and append the lines to a local file.
for page in range(1, 500):
    url = 'http://www.qzone.cc/qianming/qinglv/list_%s.html' % page  # URL of the page to crawl
    try:
        # timeout so a stalled server cannot hang the whole crawl
        response = requests.get(url, timeout=10)
        # declare the charset, then parse the DECODED text — setting
        # .encoding only affects .text, not .content (raw bytes)
        response.encoding = 'utf-8'
        html = etree.HTML(response.text)
        # grab the text of every signature paragraph on the page
        content = html.xpath("//*[@id='refreshDiv']/dl/dd/div/p/text()")
    except Exception:
        # fetch/parse failed for this page; report and move on
        print("获取页面 %s 失败" % page)
        continue
    with open('情侣.txt', 'a+', encoding='utf-8') as f:  # append the data
        for con in content:
            try:
                f.write(con)
                f.write('\n')
                print('内容写入成功%s' % con)
            except Exception:
                # keep going even if one line fails to write
                print('内容写入失败')
    # throttle one request per second so the crawl does not hammer
    # the site and risk an IP ban
    time.sleep(1)
上面用到了 XPath，其具体语法请自行查阅相关教程；本人使用谷歌浏览器，并借助 xpath helper 这个插件来调试 XPath 表达式。