在学习了 urllib 和 Fiddler 抓包爬虫之后,最大的感触就是费时费力、效果也不好。偶然在一篇文章中发现可以基于开发者工具用 XPath 爬取,并对相同标签按位置进行索引,解决了同一条目下存在多个相同标签的问题。下面是写入数据库后的效果。
爬虫代码
爬虫过程中要注意的问题都在代码中有所注释
import requests
from lxml import etree
import pymysql
def get_url(url):
    """Scrape one page of the Douban Top-250 list and store it in MySQL.

    Fetches the page, extracts title / rating / vote count / one-line
    summary / detail link via XPath, and inserts one row per movie into
    the ``douban`` table of the local ``dangdang`` database.

    Parameters:
        url: a Douban Top-250 list page, e.g.
             ``https://movie.douban.com/top250?start=0&filter=``
    """
    # A browser User-Agent header avoids the site's anti-bot blocking.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'}
    html = requests.get(url, headers=headers)
    # Parse the response so it can be queried with XPath.
    selector = etree.HTML(html.text)
    # Each movie has a Chinese and an English title; [1] keeps only the
    # first (Chinese) <span class="title"> per entry.
    names = selector.xpath('//span[1][@class ="title"]/text()')
    scores = selector.xpath('//span[@class ="rating_num"]/text()')
    # The vote-count span carries no class attribute, so it is indexed
    # by position (4th span) instead.
    evaluates = selector.xpath('//span[4]/text()')
    summaries = selector.xpath('//span[@class ="inq"]/text()')
    links = selector.xpath('//a[@class =""]/@href')

    # One connection for the whole page instead of one per row.
    # charset='utf8mb4' so the Chinese titles/summaries round-trip intact.
    conn = pymysql.connect(host='127.0.0.1', user='root', passwd='root',
                           db='dangdang', charset='utf8mb4')
    try:
        with conn.cursor() as cursor:
            # zip() stops at the shortest list, so a movie missing its
            # one-line summary no longer aborts the remaining inserts
            # the way the old index-based loop did.
            for name, score, evaluate, summary, link in zip(
                    names, scores, evaluates, summaries, links):
                # Parameterized query: fixes the SQL-injection/quoting bug
                # of the original string-concatenated INSERT (a title
                # containing a quote would break it). Column names
                # (socre, evluate) are kept to match the existing schema.
                cursor.execute(
                    "insert into douban(name,socre,evluate,summary,link)"
                    " values (%s,%s,%s,%s,%s)",
                    (name, score, evaluate, summary, link))
        # pymysql does NOT autocommit by default -- the original version
        # never committed, so no rows were actually persisted.
        conn.commit()
    except Exception as e:
        conn.rollback()
        print(e)
    finally:
        conn.close()
# The Top 250 list spans 10 pages of 25 movies; page i begins at start=i*25.
# The original used (i-1)*25 with i starting at 0, which requested
# start=-25 for the first page and never fetched the last one (start=225).
for page in range(10):
    url = 'https://movie.douban.com/top250?start=' + str(page * 25) + '&filter='
    get_url(url)
# get_url("https://movie.douban.com/top250?start=0&filter=")