使用requests+lxml爬取网页
# Fetch a Douban movie page with requests and extract the title via lxml/XPath.
import requests
from lxml import etree

url = 'https://movie.douban.com/subject/1292052/'
headers = {
"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Safari/605.1.15"
}
# timeout= so a stalled connection cannot hang the script indefinitely.
r = requests.get(url, headers=headers, timeout=10)
# Fail loudly on HTTP errors (403/404/...) instead of parsing an error page.
r.raise_for_status()
tree = etree.HTML(r.text)
# xpath(...text()) returns a LIST of text nodes, possibly empty.
el = tree.xpath('//*[@id="content"]/h1/span[1]/text()')
# Print the extracted title rather than the raw list representation.
print(el[0] if el else None)
使用ruia框架爬取网页
import asyncio
from ruia import Item, TextField
class DoubanItem(Item):
    """
    Declare the target fields the crawler should extract from the page.
    """
    # Movie title, selected from the first <span> inside the #content <h1>.
    title = TextField(css_select='#content > h1 > span:nth-child(1)')
# Build the coroutine that fetches and parses the page.
async_func = DoubanItem.get_item(url="https://movie.douban.com/subject/1292052/")
# asyncio.run() creates and closes its own event loop; it replaces the
# deprecated get_event_loop()/run_until_complete pattern (deprecated
# for this use since Python 3.10).
item = asyncio.run(async_func)
print(item.title)
对比：requests+lxml 的方式需要手动处理请求、解析和提取，代码量较多；
ruia 框架通过声明式的 Item/Field 定义目标字段，写法更简洁。