Web Scraping

1. Install Python (Python 2 and Python 3 are not compatible with each other).
2. Run the terminal as administrator and install beautifulsoup4: pip install beautifulsoup4
3. Install requests: pip install requests
4. Analyze the structure of the target page's HTML.
Simple example code:

import requests
from bs4 import BeautifulSoup

resp = requests.get("URL of the target page")
soup = BeautifulSoup(resp.text, 'html.parser')
# analyze the page and find the matching tags and classes
title = soup.find('ul', class_='detaila').text.strip()
content = soup.find('ul', class_='detailc').text.strip()

file_name = '{}.txt'.format(title)
with open(file_name, 'w', encoding='utf-8', newline='') as f:
    f.write(content)
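
The snippet above assumes the request succeeded and that requests guessed the page encoding correctly. A slightly more defensive variant of the same download-and-parse step could look like the sketch below; the URL placeholder and the detailc class are carried over from the example above and are specific to the page being scraped:

import requests
from bs4 import BeautifulSoup

resp = requests.get("URL of the target page", timeout=10)
resp.raise_for_status()                  # stop early on 4xx/5xx instead of parsing an error page
resp.encoding = resp.apparent_encoding   # Chinese pages are often GBK; let requests re-detect the charset
soup = BeautifulSoup(resp.text, 'html.parser')

node = soup.find('ul', class_='detailc')
if node is None:
    raise SystemExit('selector did not match - re-check the page structure')
print(node.text.strip()[:200])           # preview the extracted text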

Example: scraping 飞华网 (fh21.com.cn) with multiple threads

import re
import sys
import time
import requests
import threading
from urllib import parse
from bs4 import BeautifulSoup


ori_url = 'http://dise.fh21.com.cn/department/illnesses.html'
session = requests.session()
root_urls = []  # absolute urls of every department listing page
tag_urls = []  # absolute urls of every page to scrape
times = 16  # number of worker threads to split the work across


def main():
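    # collect the absolute url of every department listing page from the index page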
    soup = request_get(ori_url)
    for root in soup.find_all('ul', class_='level2'):
        for tag in root.find_all('a', class_='link08 '):
            root_urls.append(parse.urljoin(ori_url, tag['href']))

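    # walk every department page (and its pagination) and collect the urls of the individual disease pages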
    for url in root_urls:
        soup = request_get(url)
        if soup is None:
            #print('Skip this one url above.', file=sys.stderr)
            continue
        list_root = soup.find('div', class_='dise_list')
        for a in list_root.find_all('a', class_='link08'):
            target = a.get('href')
            tag_urls.append(target)
        page_tab = soup.find('div', class_='pageStyle')
        if page_tab:
            next_page = page_tab.find('span', class_='current').next_sibling
            if next_page:
                soup = request_get(parse.urljoin(ori_url, next_page.get('href')))
                scrape_list_page(soup)
    #print('A total of {} urls were scraped.'.format(len(tag_urls)), file=sys.stderr)
    #print('--------    Start saving...    --------', file=sys.stderr)

    # split tag_urls into `times` roughly equal slices, one slice per worker thread
    count = 0
    temp = len(tag_urls) // times
    threads = []
    while count < times:
        t = threading.Thread(target=process_task, args=(tag_urls[(temp * count):(temp * (count + 1))],))
        threads.append(t)
        count += 1
    # the division above usually leaves a remainder; hand those urls to one extra thread
    if (temp * count) < len(tag_urls):
        t = threading.Thread(target=process_task, args=(tag_urls[(temp * count):],))
        threads.append(t)
    for t in threads:
        t.start()
    for t in threads:
        t.join()

    tag_urls.clear()
    root_urls.clear()
    #print('All completed.', file=sys.stderr)


def request_get(url):
    resp = session.get(url)
    #print(url)
    if resp.status_code != 200:
        #print('404', file=sys.stderr)
        return None  # tell the caller to skip this url
    return BeautifulSoup(resp.text, 'lxml')


def scrape_list_page(soup):
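    # re-parse everything after the second .dise_list_title block and collect its .link08 anchors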
    for a in BeautifulSoup(str(list(soup.select('.dise_list_title')[1].next_siblings)), 'html.parser').select('.link08'):
        target = a.get('href')
        tag_urls.append(target)
    page_tab = soup.find('div', class_='pageStyle')
    if page_tab:
        next_page = page_tab.find('span', class_='current').next_sibling
        if next_page:
            soup = request_get(parse.urljoin(ori_url, next_page.get('href')))
            scrape_list_page(soup)


def process_task(targets):
    for url in targets:
        time.sleep(1)  # throttle the worker so the site is not hammered
        soup = request_get(url)
        if soup is None:
            continue  # the request failed; skip this url
        detail_url = parse.urljoin(url, soup.select('p[data-seq="3"] > a')[0].get('href'))
        save_txt(request_get(detail_url))


def save_txt(soup):
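    # take the page name from the third 'link04' anchor in the navigator block and replace characters that are illegal in file names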
    title = re.sub(r'[\\/\:\*\?"\<\>\|]', '@', (soup.find('div', class_='navigator').find_all('a', class_='link04')[2].text)).strip()
    content = soup.find('ul', class_='detailc').text.strip()
    file_name = '{}.txt'.format(title)
    with open(file_name, 'w', encoding='utf-8', newline='') as f:
        f.write(content)

if __name__ == '__main__':
    main()
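
As a design note, the manual slicing and thread bookkeeping in main() can be written more compactly with the standard library's concurrent.futures module. The sketch below is not part of the original script; it assumes process_task and tag_urls defined above and mirrors the times setting with max_workers:

from concurrent.futures import ThreadPoolExecutor


def run_with_pool(urls, workers=16):
    # each url becomes a one-element batch so process_task keeps its original signature
    with ThreadPoolExecutor(max_workers=workers) as pool:
        list(pool.map(lambda u: process_task([u]), urls))

# run_with_pool(tag_urls, workers=times) would replace the while loop and the start/join calls in main()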