from bs4 import BeautifulSoup
import requests
def one(url):
    """Fetch a 39.net symptom page and return its guide text as one string.

    Parameters
    ----------
    url : str
        Full page URL, e.g. 'http://wapjbk.39.net/<disease>/zztz/'.

    Returns
    -------
    str
        Stripped text of every <p>/<h2>/<h3>/<h4> inside the
        'hos-guide-box' <div>, joined with single spaces; "" if the
        guide box is not present on the page.
    """
    # Redirects are disabled so a moved/removed page is not silently
    # replaced by whatever landing page the site redirects to.
    resp = requests.get(url, allow_redirects=False)
    return _extract_guide_text(resp.text)


def _extract_guide_text(html):
    """Parse *html* and join the guide box's heading/paragraph texts."""
    soup = BeautifulSoup(html, 'html.parser')
    box = soup.find('div', 'hos-guide-box')
    if box is None:
        # Layout changed or empty page: return "" instead of letting
        # the original AttributeError propagate.
        return ""
    parts = [tag.text.strip() for tag in box.find_all(['p', 'h2', 'h3', 'h4'])]
    return " ".join(parts)
# NOTE: use the XHR endpoint below to filter out irrelevant information
def main():
    """Download symptom descriptions for one disease category and save
    each disease's text to its own .txt file.

    The endpoint returns a JSON list of 'relative_url~disease_name'
    strings; id=4 / cateId=77 select the category being scraped.
    """
    # NOTE(review): the original author flagged this URL as unreachable
    # ("这个网址打不开" — "this URL cannot be opened"); confirm the
    # endpoint still works before relying on this script.
    entries = requests.post(
        'http://wapjbk.39.net/DiseaseArea/SpeciesCate',
        data={'id': 4, 'cateId': 77},
    ).json()
    for entry in entries:
        # parts[0] is the relative URL segment, parts[1] the disease name.
        parts = entry.split('~')
        rel_url, name = parts[0], parts[1]
        url = 'http://wapjbk.39.net/' + rel_url + '/zztz/'
        text = one(url)
        out_path = "D://dabao//爬虫练习//泌尿系统//" + name + ".txt"
        # 'with' guarantees the handle is closed; the original opened
        # one file per disease and never closed any of them.
        with open(out_path, 'a', encoding='utf-8') as f:
            f.write(text)
if __name__ == '__main__':
    # Run the scraper only when executed as a script (the original
    # call was not indented under the guard — a syntax error).
    main()