1.安装Python 3(注意:Python 2 与 Python 3 互不兼容,本文代码基于 Python 3)
2.以管理员身份运行终端,下载beautifulsoup4,执行命令:pip install beautifulsoup4
3.下载requests:pip install requests
4.分析网页代码结构:
简单代码如下:
import requests
from bs4 import BeautifulSoup

# Fetch the page and parse its HTML.
resp = requests.get("网址")
soup = BeautifulSoup(resp.text, 'html.parser')
# Locate the title and body text by their CSS classes.
title = soup.find('ul', class_='detaila').text.strip()
content = soup.find('ul', class_='detailc').text.strip()
file_name = '{}.txt'.format(title)
# encoding='utf-8' is required: without it the platform default encoding is
# used (e.g. GBK on Chinese Windows, cp1252 elsewhere) and writing the
# scraped Chinese text can raise UnicodeEncodeError or produce mojibake.
with open(file_name, 'w', encoding='utf-8', newline='') as f:
    f.write(content)
以下为开多线程爬取飞华网的完整实例:
import re
import sys
import time
import requests
import threading
from urllib import parse
from bs4 import BeautifulSoup
# Entry page: the department/disease index of fh21.com.cn.
ori_url = 'http://dise.fh21.com.cn/department/illnesses.html'
session = requests.session()  # shared HTTP session so connections are reused
root_urls = []  # absolute URLs of every department page
tag_urls = []  # absolute URLs of every disease page
times = 16  # number of worker threads to split tag_urls across
def main():
    """Scrape every disease page linked from the department index and save
    each one's description as an individual ``<title>.txt`` file.

    Flow: index page -> department pages (root_urls) -> disease pages
    (tag_urls), then the url list is split across ``times`` worker threads.
    """
    soup = request_get(ori_url)
    # Collect the absolute URL of every department from the index page.
    for root in soup.find_all('ul', class_='level2'):
        for tag in root.find_all('a', class_='link08 '):
            root_urls.append(parse.urljoin(ori_url, tag['href']))
    # Visit each department and gather disease-detail URLs, following pagination.
    for url in root_urls:
        soup = request_get(url)
        # Fix: was `soup is 'pass'` — identity comparison against a str
        # literal relies on CPython interning and raises SyntaxWarning on 3.8+.
        if soup == 'pass':
            continue
        list_root = soup.find('div', class_='dise_list')
        for a in list_root.find_all('a', class_='link08'):
            tag_urls.append(a.get('href'))
        page_tab = soup.find('div', class_='pageStyle')
        if page_tab:
            # The element right after the current-page marker is the next page.
            next_page = page_tab.find('span', class_='current').next_sibling
            if next_page:
                soup = request_get(parse.urljoin(ori_url, next_page.get('href')))
                scrape_list_page(soup)
    # Split tag_urls into `times` equal chunks, one worker thread per chunk.
    chunk = len(tag_urls) // times
    threads = []
    for i in range(times):
        t = threading.Thread(target=process_task,
                             args=(tag_urls[chunk * i:chunk * (i + 1)],))
        threads.append(t)
    if chunk * times < len(tag_urls):
        # Remainder chunk. Fix: `args` must be a 1-tuple wrapping the list —
        # the original passed the bare list, which unpacked every url as a
        # separate positional argument and raised TypeError in the thread.
        t = threading.Thread(target=process_task, args=(tag_urls[chunk * times:],))
        threads.append(t)
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    # Fix: the original then looped over tag_urls sequentially and scraped
    # every page a second time, duplicating all the work the threads had
    # just finished; that redundant pass has been removed.
    tag_urls.clear()
    root_urls.clear()
def request_get(url):
    """GET *url* via the shared session.

    Returns a parsed BeautifulSoup on HTTP 200, or the sentinel string
    ``'pass'`` on any other status code (callers compare against it to
    skip the page).
    """
    resp = session.get(url)
    # Fix: was `resp.status_code is not 200` — identity comparison against
    # an int literal is implementation-dependent (small-int caching) and
    # raises SyntaxWarning on Python 3.8+; use value inequality.
    if resp.status_code != 200:
        return 'pass'
    return BeautifulSoup(resp.text, 'lxml')
def scrape_list_page(soup):
    # Recursively harvest disease-detail links from one paginated list page,
    # appending each href to the module-level tag_urls list.
    # NOTE(review): the siblings following the SECOND '.dise_list_title'
    # heading are stringified and re-parsed as a standalone fragment so that
    # '.link08' anchors can be selected from that section only — presumably
    # to exclude links under the first heading; confirm against the page HTML.
    for a in BeautifulSoup(str(list(soup.select('.dise_list_title')[1].next_siblings)), 'html.parser').select('.link08'):
        target = a.get('href')
        tag_urls.append(target)
    # Pagination: the element immediately after the current-page marker is
    # the "next page" link; fetch it and recurse until there is none.
    page_tab = soup.find('div', class_='pageStyle')
    if page_tab:
        next_page = page_tab.find('span', class_='current').next_sibling
        if next_page:
            soup = request_get(parse.urljoin(ori_url, next_page.get('href')))
            scrape_list_page(soup)
def process_task(targets):
    """Worker-thread body: fetch each disease page in *targets*, follow its
    third-section detail link, and save the detail text to disk.

    Robustness fixes: the original crashed the whole thread with
    AttributeError when request_get returned the 'pass' sentinel (non-200
    reply) and with IndexError when the expected link was missing; both
    cases are now skipped so one bad page cannot kill the worker.
    """
    for url in targets:
        time.sleep(1)  # throttle: be polite to the server
        soup = request_get(url)
        if soup == 'pass':  # fetch failed — skip this url
            continue
        links = soup.select('p[data-seq="3"] > a')
        if not links:  # page layout differs — no detail link to follow
            continue
        detail_url = parse.urljoin(url, links[0].get('href'))
        detail_soup = request_get(detail_url)
        if detail_soup != 'pass':
            save_txt(detail_soup)
def save_txt(soup):
    """Write the disease description found in *soup* to ``<title>.txt``.

    The title is taken from the third breadcrumb link in the navigator bar;
    characters that are illegal in Windows file names are replaced with '@'.
    """
    breadcrumbs = soup.find('div', class_='navigator').find_all('a', class_='link04')
    raw_title = breadcrumbs[2].text
    safe_title = re.sub(r'[\\/\:\*\?"\<\>\|]', '@', raw_title).strip()
    body = soup.find('ul', class_='detailc').text.strip()
    # UTF-8 so Chinese text is written correctly on any platform.
    with open(f'{safe_title}.txt', 'w', encoding='utf-8', newline='') as out:
        out.write(body)
# Run the scraper only when executed as a script, not on import.
if __name__ == '__main__':
    main()