import requests
import re
from bs4 import BeautifulSoup
if __name__ == '__main__':
    # Step 1: fetch the listing page of licence cards (cardtype=100).
    url = 'https://xxxxxxxxxxx/wssb/websearch/SearchCardAction.do?operate=searchGyEntCard&operPage=card_hzpscxkz_list&cardtype=100'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.76'
    }
    response = requests.get(url=url, headers=headers).text

    # Step 2: extract record IDs.  Each matching <a> tag's href contains a
    # JS-style call with the ID in parentheses (e.g. onclick="view('ID')");
    # capture the text inside the parentheses and strip surrounding quotes.
    # Compile the pattern once and reuse it for both filtering and extraction.
    id_pattern = re.compile(r'[(](.*)[)]')
    soup = BeautifulSoup(response, 'html.parser')
    business_list = soup.find_all(name='a', attrs={'href': id_pattern})
    id_list = []
    for content in business_list:
        for rec_id in id_pattern.findall(str(content)):  # rec_id: don't shadow builtin `id`
            id_list.append(rec_id.strip("'"))

    # Step 3: issue one detail request per record ID to collect each
    # business's full card page.
    url_business = 'https://xxxxxxxxxxxxxxx/wssb/websearch/SearchCardAction.do?operate=viewGyEntCard&operPage=card_hzpscxkz_view&recid='
    content_list = []
    for rec_id in id_list:
        url_full = url_business + rec_id
        content_list.append(requests.get(url=url_full, headers=headers).text)

    # Step 4: persist the pages.  Open the output file once (not once per
    # page) and write the HTML text directly.  Bug fixed: the original
    # called .encode() and then str(bytes), which wrote "b'...'" byte reprs
    # into content.txt instead of the actual HTML.
    with open('./content.txt', 'a', encoding='utf-8') as fs:
        for text in content_list:
            fs.write(text)
    print("over")