[Python] Web Crawler Example (Static Site)

Features of this crawler:

1. Target: a static website

2. Depth: two levels, an index page plus the chapter pages it links to (see the sketch below)

3. Threading: single-threaded (no synchronization is used; a single thread keeps the chapters in order)

4. Result: crawls a web novel and merges its scattered chapters into a single txt file
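
Condensed, the two levels look like this: level one fetches the index page and extracts every chapter link; level two fetches each chapter page and appends its text to the output file. A minimal self-contained sketch, assuming the page markup matches the regexes used in the code below (the output filename here is a placeholder):

import re
import requests

INDEX_URL = "https://www.x23us.com/html/0/328/"

# Level 1: fetch the index page and extract (relative_href, title) pairs.
index = requests.get(INDEX_URL)
index.encoding = index.apparent_encoding
chapters = re.findall('<td class="L"><a href="(.*?)">(.*?)</a></td>', index.text, re.S)
base = re.findall('<meta property="og:url" content="(.*?)"/>', index.text, re.S)[0]

# Level 2: fetch each chapter page and append its text to one file.
with open("novel.txt", "a", encoding="utf-8") as out:
    for href, title in chapters:
        page = requests.get(base + href)
        page.encoding = page.apparent_encoding
        body = re.findall('<dd id="contents">(.*?)</dd>', page.text, re.S)[0]
        out.write("\n" + title + "\n" + body.replace("&nbsp;", " ").replace("<br />", "\n"))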

 

Template for fetching a page:

def get_url(url):
    # Fetch a page and return its decoded HTML text, or None on failure.
    try:
        response = requests.get(url)
        print(response.encoding)
        print(response.apparent_encoding)
        # The site's declared encoding is unreliable, so switch to the
        # encoding detected from the response body before reading .text.
        response.encoding = response.apparent_encoding
        if response.status_code == 200:
            return response.text
        print("URL error:", url)
        return None
    except RequestException:
        print("URL RequestException error:", url)
        return None
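
For real-world use it usually helps to give requests.get a timeout and a browser-like User-Agent, so slow or picky servers neither hang the crawler nor reject it. A variant sketch; the header value and the 10-second timeout are illustrative choices, not part of the original code:

import requests
from requests.exceptions import RequestException

def get_url_safe(url):
    # Same contract as get_url, plus a timeout and a User-Agent header
    # (both are assumptions added for robustness, not from the article).
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.encoding = response.apparent_encoding
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None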

Parsing and saving function:

def parse_url(html):
    # Level 1: extract (relative_href, title) pairs for every chapter
    # listed on the index page.
    pattern = re.compile('<td class="L"><a href="(.*?)">(.*?)</a></td>', re.S)
    items = re.findall(pattern, html)
    # The og:url meta tag supplies the base URL the relative links join to.
    pattern_page = re.compile('<meta property="og:url" content="(.*?)"/>', re.S)
    item_page = re.findall(pattern_page, html)
    if not item_page:
        print("og:url meta tag not found; cannot build chapter URLs")
        return
    print(items)
    print(len(items))
    count = 0
    # Open the output file once, in append mode so an interrupted run can
    # be resumed. (The original chunked open/close left `file` undefined
    # when resuming past the first chunk.)
    file = open(os.path.join(sys.path[0], "凡人修仙传.txt"), "a", encoding="utf-8")
    for item in items:
        count += 1
        if count <= 2416:  # hard-coded resume point: skip chapters saved by an earlier run
            continue
        this_url = item_page[0] + item[0]
        this_title = item[1]
        # Level 2: fetch the chapter body; replace undecodable characters
        # so the write cannot fail on them.
        essay = get_book(this_url, this_title).replace("\ufffd", "*")
        try:
            file.write(essay)
            if count % 100 == 0 or count == len(items):
                file.flush()
                print("First " + str(count) + " chapters saved!")
            print("Downloaded chapter " + str(count), item, count / len(items) * 100, "%")
        except OSError:
            print("Write error:", item)
            print(essay)
    file.close()
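
To make the two regexes concrete, here is what they return on a hand-written fragment (the fragment is illustrative; the real index page is assumed to use this markup):

import re

html = ('<meta property="og:url" content="https://www.x23us.com/html/0/328/"/>'
        '<td class="L"><a href="1.html">Chapter 1</a></td>'
        '<td class="L"><a href="2.html">Chapter 2</a></td>')

items = re.findall('<td class="L"><a href="(.*?)">(.*?)</a></td>', html, re.S)
base = re.findall('<meta property="og:url" content="(.*?)"/>', html, re.S)
print(items)                  # [('1.html', 'Chapter 1'), ('2.html', 'Chapter 2')]
print(base[0] + items[0][0])  # https://www.x23us.com/html/0/328/1.html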

Full code:

import os
import re
import sys

import requests
from requests.exceptions import RequestException


def get_url(url):
    # Fetch a page and return its decoded HTML text, or None on failure.
    try:
        response = requests.get(url)
        print(response.encoding)
        print(response.apparent_encoding)
        # The site's declared encoding is unreliable, so switch to the
        # encoding detected from the response body before reading .text.
        response.encoding = response.apparent_encoding
        if response.status_code == 200:
            return response.text
        print("URL error:", url)
        return None
    except RequestException:
        print("URL RequestException error:", url)
        return None


def parse_url(html):
    # Level 1: extract (relative_href, title) pairs for every chapter
    # listed on the index page.
    pattern = re.compile('<td class="L"><a href="(.*?)">(.*?)</a></td>', re.S)
    items = re.findall(pattern, html)
    # The og:url meta tag supplies the base URL the relative links join to.
    pattern_page = re.compile('<meta property="og:url" content="(.*?)"/>', re.S)
    item_page = re.findall(pattern_page, html)
    if not item_page:
        print("og:url meta tag not found; cannot build chapter URLs")
        return
    print(items)
    print(len(items))
    count = 0
    # Open the output file once, in append mode so an interrupted run can
    # be resumed. (The original chunked open/close left `file` undefined
    # when resuming past the first chunk.)
    file = open(os.path.join(sys.path[0], "凡人修仙传.txt"), "a", encoding="utf-8")
    for item in items:
        count += 1
        if count <= 2416:  # hard-coded resume point: skip chapters saved by an earlier run
            continue
        this_url = item_page[0] + item[0]
        this_title = item[1]
        # Level 2: fetch the chapter body; replace undecodable characters
        # so the write cannot fail on them.
        essay = get_book(this_url, this_title).replace("\ufffd", "*")
        try:
            file.write(essay)
            if count % 100 == 0 or count == len(items):
                file.flush()
                print("First " + str(count) + " chapters saved!")
            print("Downloaded chapter " + str(count), item, count / len(items) * 100, "%")
        except OSError:
            print("Write error:", item)
            print(essay)
    file.close()


def get_book(url, title):
    # Fetch one chapter page and return "\n<title>\n<body text>".
    data = "\n" + str(title) + "\n"
    html = get_url(url)
    if html is None:
        return data  # fetch failed; keep just the title as a placeholder
    pattern = re.compile('<dd id="contents">(.*?)</dd>', re.S)
    essay = re.findall(pattern, html)
    if not essay:
        return data  # no chapter body found on the page
    return data + str(essay[0]).replace("&nbsp;", " ").replace("<br />", "\n")


if __name__ == '__main__':
    # Fetch the novel's index page, then crawl every chapter it links to.
    html = get_url("https://www.x23us.com/html/0/328/")
    if html:
        parse_url(html)
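
One design note: this_url = item_page[0] + item[0] only works because this site's chapter hrefs happen to be relative and the og:url value ends with a slash. urllib.parse.urljoin from the standard library handles absolute paths and missing slashes as well; a sketch:

from urllib.parse import urljoin

base = "https://www.x23us.com/html/0/328/"
print(urljoin(base, "1.html"))          # https://www.x23us.com/html/0/328/1.html
print(urljoin(base, "/html/0/1.html"))  # https://www.x23us.com/html/0/1.html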
