1. 引入模块和函数
from requests_html import HTMLSession
from urllib.parse import urlparse, parse_qs
import pprint
import time,datetime
import pandas as pd
from random import random
2. 页面爬取代码
def sogou_weixin(url,params):
r = session.get(url, params = payload)
# 先取特定元素, 精准打击其子后辈
主要元素 =r.html.xpath('//div[@class="news-box"]/ul[@class="news-list"]/li')
dict_xpath={
'text_content':{
'标题':'//div[@class="txt-box"]/h3/a',
'内容':'//div[@class="txt-box"]/p',
# '公众号名称':'//div[@class="txt-box"]/div[@class="s-p"]/a'
},
'href':{
'文章链接':'//div[@class="txt-box"]/h3/a',