# 爬取某荒小说网站。该网站广告比较少,易于尝试爬取;请不要过度爬取,以免造成网站崩溃。
import requests
from fake_useragent import UserAgent
from lxml import etree
import os,time
def get_html(url, timeout=10):
    """Download *url* using a randomly chosen User-Agent header.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight through to ``requests.get``.

    Returns:
        The ``requests.Response`` object, with its text encoding set to
        UTF-8.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            timeout expires.
    """
    headers = {'User-Agent': UserAgent().random}
    # requests never times out unless told to, so an unresponsive server
    # would otherwise stall the crawler forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    # The original assigned 'gzip' here, but that is a content-coding, not a
    # character set, so it cannot be used to decode the body text. UTF-8 is
    # named explicitly instead.
    # NOTE(review): confirm the target site actually serves UTF-8 pages.
    response.encoding = 'utf-8'
    return response
def be_tree(url):
    """Fetch *url* via ``get_html`` and parse the response text with lxml.

    Args:
        url: Address of the page to download and parse.

    Returns:
        The value returned by ``etree.HTML`` for the page text, ready for
        XPath queries.
    """
    page_text = get_html(url).text
    return etree.HTML(page_text)
def get_novelinfos(url):
tree = be_tree(url)
titles = tree.xpath('/html/body/div[1]/div[1]/div/ul/li/div/a/div[2]/h4/text()')
true_urls = tree.xpath('/html/body/div[1]/div[1]/div/ul/li/div/a/@href'