import time
import random
import requests
import csv
from time import sleep
from fake_useragent import UserAgent
from lxml import etree
# (关键部分代码如下 — key code follows)
# 1、获取首页源码 (step 1: fetch the page source)
def get_fund(url):
    """Download the HTML source of *url* with a randomized User-Agent.

    A short random pause (1-2 s) precedes each request to throttle the
    crawler and reduce the chance of being blocked.

    Args:
        url: The page URL to fetch.

    Returns:
        The response body as text.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout (10 s).
    """
    # Polite random delay between requests.
    sleep(random.uniform(1, 2))
    # Rotate the User-Agent on every call via fake_useragent.
    headers = {'User-Agent': UserAgent(verify_ssl=False).random}
    response = requests.get(url, headers=headers, timeout=10)
    # Fail fast on HTTP errors instead of handing an error page to the parser.
    response.raise_for_status()
    return response.text
# 2、爬取数据 (step 2: parse the data)
def parse_fund(html):
parse = etree.HTML(html)
items = parse.xpath('//*[@id="articlelistnew"]/div')[1:]
for item in items:
item = {
'阅读': ''.join(item.xpath('./span[1]/text()')).strip(),
'评论': ''.join(item.xpath('./span[2]/text()')).strip(),
'标题': ''.join(ite