import redis
import requests
from lxml import etree
def get_xpath_by_requests(url, timeout=10):
    """Download ``url`` and parse the response body into an lxml HTML tree.

    The request is sent with a desktop-browser ``User-Agent`` header.

    Args:
        url: Address of the page to download.
        timeout: Seconds ``requests`` waits for the server before giving up.
            Without a timeout a stalled connection would block forever.

    Returns:
        The tree produced by ``etree.HTML`` when the server answers with
        status 200, otherwise ``None``. Callers must check for ``None``
        before calling ``.xpath()`` on the result.

    Raises:
        requests.RequestException: Network failures and timeouts are not
            caught here and propagate to the caller.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        # Make the "no usable page" outcome explicit instead of falling off
        # the end of the function.
        return None
    return etree.HTML(response.text)
def get_text(text):
    """Return the first item of ``text``, or ``''`` when ``text`` is falsy.

    Args:
        text: An indexable value such as the list returned by an XPath
            query; may be empty or ``None``.

    Returns:
        ``text[0]`` when ``text`` is truthy, otherwise the empty string.
    """
    return text[0] if text else ''
def parse_caixi(url):
    """Push every paginated listing URL of one category onto a Redis list.

    The category page is fetched, its page count is read from the text of
    the ``gopage`` form, and one URL per page is pushed to the Redis list
    ``picture_caixi:start_urls`` (and printed).

    Args:
        url: Category URL taken from the index page.

    Returns:
        None. If the category page cannot be fetched, nothing is pushed.
    """
    html = get_xpath_by_requests(url)
    if html is None:
        # The fetch helper yields None for non-200 responses; skip this
        # category instead of crashing on ``html.xpath``.
        print('skip {}: page could not be fetched'.format(url))
        return

    infos = get_text(html.xpath('//span[@class="gopage"]/form/text()'))
    # NOTE(review): assumes the page count is what remains after dropping the
    # first character and the last five characters of the form text --
    # confirm against the live markup.
    info = infos[1:-5]
    try:
        # An empty slice means no pager was found: treat as a single page.
        max_page = int(info) if info else 1
    except ValueError:
        # Unexpected pager text; fall back to one page rather than abort.
        max_page = 1

    # Named ``redis_client`` so the stdlib module name ``re`` is not shadowed.
    redis_client = redis.Redis()
    for i in range(1, max_page + 1):
        page_url = url + '?&page={}'.format(i)
        print(page_url)
        redis_client.lpush('picture_caixi:start_urls', page_url)
def main():
    """Read the category index page and queue the pages of every category.

    Category links are taken from the ``dd`` entries of the navigation
    ``dl`` on the index page, and each one is handed to ``parse_caixi``.

    Raises:
        SystemExit: If the index page cannot be fetched, since there is
            nothing to crawl without it.
    """
    base_url = 'https://www.meishij.net/china-food/caixi/'
    html = get_xpath_by_requests(base_url)
    if html is None:
        # The fetch helper yields None for non-200 responses; fail with a
        # clear message instead of an AttributeError on ``html.xpath``.
        raise SystemExit('failed to fetch category index: {}'.format(base_url))

    caixi_list = html.xpath('//dl[@class="listnav_dl_style1 w990 clearfix"]/dd//a/@href')
    for url in caixi_list:
        parse_caixi(url)
# Run the crawl only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
# Note from the original author: running this script raises an error.