1.安装Python 3(注意:Python 2 与 Python 3 互不兼容,本文代码基于 Python 3)
2.以管理员身份运行终端,下载beautifulsoup4,执行命令:pip install beautifulsoup4
3.下载requests:pip install requests
4.分析网页代码结构:
简单代码如下:
import requests
from bs4 import BeautifulSoup

# Fetch the page and parse its HTML.
resp = requests.get("网址")
soup = BeautifulSoup(resp.text, 'html.parser')
# Locate the title and body text by their CSS classes.
title = soup.find('ul', class_='detaila').text.strip()
content = soup.find('ul', class_='detailc').text.strip()
file_name = '{}.txt'.format(title)
# encoding='utf-8' is required: without it the platform default encoding is
# used (e.g. GBK on Chinese Windows, cp1252 elsewhere) and writing the
# scraped Chinese text can raise UnicodeEncodeError or produce mojibake.
with open(file_name, 'w', encoding='utf-8', newline='') as f:
    f.write(content)
以下为开多线程爬取飞华网的完整实例:
import re
import sys
import time
import requests
import threading
from urllib import parse
from bs4 import BeautifulSoup
# Entry page: the department/disease index of fh21.com.cn.
ori_url = 'http://dise.fh21.com.cn/department/illnesses.html'
session = requests.session()  # shared HTTP session so connections are reused
root_urls = []  # absolute URLs of every department page
tag_urls = []  # absolute URLs of every disease page
times = 16  # number of worker threads to split tag_urls across
def main():
    """Scrape every disease page linked from the department index and save
    each one's description as an individual ``<title>.txt`` file.

    Flow: index page -> department pages (root_urls) -> disease pages
    (tag_urls), then the url list is split across ``times`` worker threads.
    """
    soup = request_get(ori_url)
    # Collect the absolute URL of every department from the index page.
    for root in soup.find_all('ul', class_='level2'):
        for tag in root.find_all('a', class_='link08 '):
            root_urls.append(parse.urljoin(ori_url, tag['href']))
    # Visit each department and gather disease-detail URLs, following pagination.
    for url in root_urls:
        soup = request_get(url)
        # Fix: was `soup is 'pass'` — identity comparison against a str
        # literal relies on CPython interning and raises SyntaxWarning on 3.8+.
        if soup == 'pass':
            continue
        list_root = soup.find('div', class_='dise_list')
        for a in list_root.find_all('a', class_='link08'):
            tag_urls.append(a.get('href'))
        page_tab = soup.find('div', class_='pageStyle')
        if page_tab:
            # The element right after the current-page marker is the next page.
            next_page = page_tab.find('span', class_='current').next_sibling
            if next_page:
                soup = request_get(parse.urljoin(ori_url, next_page.get('href')))
                scrape_list_page(soup)
    # Split tag_urls into `times` equal chunks, one worker thread per chunk.
    chunk = len(tag_urls) // times
    threads = []
    for i in range(times):
        t = threading.Thread(target=process_task,
                             args=(tag_urls[chunk * i:chunk * (i + 1)],))
        threads.append(t)
    if chunk * times < len(tag_urls):
        # Remainder chunk. Fix: `args` must be a 1-tuple wrapping the list —
        # the original passed the bare list, which unpacked every url as a
        # separate positional argument and raised TypeError in the thread.
        t = threading.Thread(target=process_task, args=(tag_urls[chunk * times:],))
        threads.append(t)
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    # Fix: the original then looped over tag_urls sequentially and scraped
    # every page a second time, duplicating all the work the threads had
    # just finished; that redundant pass has been removed.
    tag_urls.clear()
    root_urls.clear()
def request_get(url):
    """GET *url* via the shared session.

    Returns a parsed BeautifulSoup on HTTP 200, or the sentinel string
    ``'pass'`` on any other status code (callers compare against it to
    skip the page).
    """
    resp = session.get(url)
    # Fix: was `resp.status_code is not 200` — identity comparison against
    # an int literal is implementation-dependent (small-int caching) and
    # raises SyntaxWarning on Python 3.8+; use value inequality.
    if resp.status_code != 200:
        return 'pass'
    return BeautifulSoup(resp.text, 'lxml')
def scrape_list_page(soup):
    # Recursively harvest disease-detail links from one paginated list page,
    # appending each href to the module-level tag_urls list.
    # NOTE(review): the siblings following the SECOND '.dise_list_title'
    # heading are stringified and re-parsed as a standalone fragment so that
    # '.link08' anchors can be selected from that section only — presumably
    # to exclude links under the first heading; confirm against the page HTML.
    for a in BeautifulSoup(str(list(soup.select('.dise_list_title')[1].next_siblings)), 'html.parser').select('.link08'):
        target = a.get('href')
        tag_urls.append(target)
    # Pagination: the element immediately after the current-page marker is
    # the "next page" link; fetch it and recurse until there is none.
    page_tab = soup.find('div', class_='pageStyle')
    if page_tab:
        next_page = page_tab.find('span', class_='current').next_sibling
        if next_page:
            soup = request_get(parse.urljoin(ori_url, next_page.get('href')))
            scrape_list_page(soup)
def process_task(targets):
    """Worker-thread body: fetch each disease page in *targets*, follow its
    third-section detail link, and save the detail text to disk.

    Robustness fixes: the original crashed the whole thread with
    AttributeError when request_get returned the 'pass' sentinel (non-200
    reply) and with IndexError when the expected link was missing; both
    cases are now skipped so one bad page cannot kill the worker.
    """
    for url in targets:
        time.sleep(1)  # throttle: be polite to the server
        soup = request_get(url)
        if soup == 'pass':  # fetch failed — skip this url
            continue
        links = soup.select('p[data-seq="3"] > a')
        if not links:  # page layout differs — no detail link to follow
            continue
        detail_url = parse.urljoin(url, links[0].get('href'))
        detail_soup = request_get(detail_url)
        if detail_soup != 'pass':
            save_txt(detail_soup)
def save_txt(soup):
    """Write the disease description found in *soup* to ``<title>.txt``.

    The title is taken from the third breadcrumb link in the navigator bar;
    characters that are illegal in Windows file names are replaced with '@'.
    """
    breadcrumbs = soup.find('div', class_='navigator').find_all('a', class_='link04')
    raw_title = breadcrumbs[2].text
    safe_title = re.sub(r'[\\/\:\*\?"\<\>\|]', '@', raw_title).strip()
    body = soup.find('ul', class_='detailc').text.strip()
    # UTF-8 so Chinese text is written correctly on any platform.
    with open(f'{safe_title}.txt', 'w', encoding='utf-8', newline='') as out:
        out.write(body)
# Run the scraper only when executed as a script, not on import.
if __name__ == '__main__':
    main()