Scraping classical Chinese poems in ~100 lines of Python

I have recently been working on an AI poetry-generation project and needed a corpus of classical Chinese poems to train the model, so I wrote a small crawler to collect the data. The crawler walks gushimi.org in three levels: the poet list, each poet's poem list, and each poem's detail page, and appends every poem it finds to result.json.

# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
from urllib import request
import chardet
import json
import re
import time

index = 0  # running index of scraped poems
poet = 0   # running index of poets
def getSoup(url):
    """Fetch a page and return a BeautifulSoup object."""
    fp = request.urlopen(url)
    content = fp.read()
    fp.close()
    det = chardet.detect(content)  # guess the page's encoding
    if det['confidence'] > 0.8:    # trust the guess only when confidence > 0.8
        html = content.decode(det['encoding'])
    else:
        html = content.decode('gbk')
    soup = BeautifulSoup(html, 'html.parser')  # explicit parser avoids the bs4 warning
    return soup
# Next page
def nextUrl(soup):
    """Return the href suffix of the "next page" link, or None if there is none."""
    # "下一页" is the site's "next page" link text; string= replaces the deprecated text= argument
    a = soup.find('a', string=re.compile("^下一页"))
    if a:
        return a.attrs['href']
    else:
        return None
# Level-1 page: the poet list
def firstPage():
    start = time.perf_counter()  # time.clock() was removed in Python 3.8
    print("Start time:", start)
    page = 1  # page number
    nt = '/shiren/index.html'
    global poet
    while nt:
        print('------------------ page ' + str(page) + ' ------------------')
        soup = getSoup('https://www.gushimi.org' + nt)
        ol = soup.findAll('div', attrs={"class": 'news_title'})
        for div in ol:
            print(str(poet) + ":" + div.a.text)
            poet = poet + 1
            secondPageUrl = 'https://www.gushimi.org' + div.a.attrs['href']
            secondPage(secondPageUrl)
            print('------------------ finished this poet ------------------')
        nt = nextUrl(soup)
        page = page + 1
    end = time.perf_counter()
    print("End time:", end)
    print("Total time:", end - start)
# Level-2 page: a poet's poem list
def secondPage(url):
    soup = getSoup(url)
    ol = soup.findAll('div', attrs={"class": "content_box"})
    for li in ol[2].findAll("li"):  # the poem list sits in the third content_box div
        thirdPageUrl = 'https://www.gushimi.org' + li.select('a')[0].attrs['href']
        thirdPage(thirdPageUrl)
# Level-3 page: a single poem's detail page
def thirdPage(url):
    global index
    soup = getSoup(url)
    str0 = soup.findAll('div', attrs={"class": "box_title"})
    str0 = str0[1].text  # title
    ol = soup.findAll('div', attrs={"class": "news_content"})
    str1 = ol[0].find_all("div", class_="old_h1")[0].select('a')[0].text  # dynasty
    str2 = ol[0].find_all("div", class_="old_h1")[0].select('a')[1].text  # author
    print("      " + str(index) + str2 + ":" + str0)
    contents = ol[0].find_all("div", class_="newstext")[0]
    str3 = []  # lines of the poem
    for li in contents.find_all("div"):
        str3.append(li.text)
    flags = ol[0].findAll("div", class_="newstext")[1]
    str4 = []  # keywords/tags
    for li in flags.find_all("a"):
        str4.append(li.text)
    jsonData = {'index': index, 'title': str0, 'dynasty': str1, 'author': str2, 'paragraphs': str3, 'key': str4}
    index = index + 1
    write_in_json_data(jsonData)
# Append one record to the output file, one JSON object per line (JSON Lines)
def write_in_json_data(jsonData):
    with open('result.json', 'a', encoding='utf-8') as f:
        json.dump(jsonData, f, ensure_ascii=False)
        f.write('\n')  # without a separator, repeated json.dump calls would produce an unparseable file

if __name__ == '__main__':
    firstPage()
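
After a run, every poem sits on its own line of result.json as one JSON object with the fields built in thirdPage above. A minimal sketch of reading the file back for the poem-generation model; the helper name load_poems is just illustrative:

import json

def load_poems(path='result.json'):
    """Read the crawler's output (one JSON object per line) into a list of dicts."""
    poems = []
    with open(path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:  # skip blank lines
                poems.append(json.loads(line))
    return poems

poems = load_poems()
print(len(poems), "poems loaded")
print(poems[0]['title'], poems[0]['paragraphs'])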

