原题:
代码
import requests
import re
from lxml import etree
from bs4 import BeautifulSoup
from time import sleep
# HTTP headers passed to requests.get() by the functions in this script.
# The User-Agent value is a desktop browser string (Chromium-based Edge 81
# on 64-bit Windows 10) sent in place of the requests library default.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36 Edg/81.0.416.68',
}
def get_text(url):
res = requests.get(url = url,headers = headers)
# print(res.text)
reg = re.compile('"docurl":"(.*?.html)"',re.S)
finall1 = re.findall(reg,res