python爬取成语网站数据

最新推荐文章于 2021-07-16 21:39:40 发布

Royal niver give up

最新推荐文章于 2021-07-16 21:39:40 发布

阅读量526

点赞数

分类专栏： python 文章标签： python

本文链接：https://blog.csdn.net/qq_34106328/article/details/106081715

版权

python 专栏收录该内容

1 篇文章 0 订阅

订阅专栏

# encoding=utf-8

import requests
from bs4 import BeautifulSoup
import sqlite3
import uuid

conn = sqlite3.connect("idiomBase.db3")  # 创建sqlite.db数据库
print("open database success")
conn.execute("drop table IF EXISTS idiom")
# id
# 成语名称
# 拼音
# 成语解释
# 成语出处
# 成语故事
# 近义词
# 反义词
# 简拼
query = """create table IF NOT EXISTS idiom(
    id VARCHAR(50),             
    name VARCHAR(50),           
    pinyin VARCHAR(50),         
    description VARCHAR(500),   
    fromWhere VARCHAR(500),              
    story VARCHAR(500),         
    similar VARCHAR(500),       
    antonym VARCHAR(500),       
    simplicity VARCHAR(500)    
);"""
conn.execute(query)
print("Table created successfully")

all_url = 'http://chengyu.t086.com'
# all_url = 'http://xh.5156edu.com'

# http请求头
HostReferer = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Referer': 'http://chengyu.t086.com/'
}

# W的数据有问题
word = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
        'W', 'X', 'Y', 'Z']

# href="/cy0/102.html"

# 写入文件F:\\data.json中，如果文件不存在则创建，有则覆盖数据
try:
    with open('F:\\data.json', 'w+') as file_data:
        for w in word:

            for n in range(1, 100):

                url = all_url + '/list/' + w + '_' + str(n) + '.html'

                # /cy10/10242.html 页面被禁止钓鱼，这个成语得过滤掉

                start_html = requests.get(url, headers=HostReferer)
                if start_html.status_code == 404:
                    break
                start_html.encoding = 'gbk'
                soup = BeautifulSoup(start_html.text, "html.parser")

                listw = soup.find('div', class_='listw')

                lista = listw.find_all('a')

                # a href
                # 获取所有的a标签，并遍历拼接href新的html页面
                for item in listw.find_all("a"):
                    # 拼接成语详情页面的地址
                    if item.get("href") != "/cy10/10242.html":
                        detail_url = all_url + item.get("href")

                        second_html = requests.get(detail_url, headers=HostReferer)
                        if second_html.status_code == 404:
                            break
                        second_html.encoding = 'gbk'
                        soup = BeautifulSoup(second_html.text, "html.parser")

                        detail_data = soup.find('div', class_='mainbar')
                        # 找到div里面的 名称、拼音、解释、出处
                        tr_data = detail_data.find_all_next('td')

                        # print(tr_data.__len__())
                        # print(tr_data)

                        # index:1-成语名称 3-拼音 5-解释 7-出处 9-举例子 11-近义词 13-反义词
                        for index in range(len(tr_data)):
                            namespace = str(uuid.uuid1())
                            if index > 17:
                                break
                            if index % 2 != 0:
                                # print(index, tr_data[index].text)
                                if index == 1:
                                    name_ = tr_data[index].text
                                    # print('"name":', '"' + tr_data[index].text + '"')
                                if index == 3:
                                    pinyin_ = tr_data[index].text
                                    # print('"pinyin":', '"' + tr_data[index].text + '"')
                                if index == 5:
                                    description_ = tr_data[index].text
                                    # print('"description":', '"' + tr_data[index].text + '"')
                                if index == 7:
                                    fromWhere_ = tr_data[index].text
                                    # print('"fromWhere":', '"' + tr_data[index].text + '"')
                                # if index == 11:
                                #     similar_ = tr_data[index].text
                                # if index == 13:
                                #     antonym_ = tr_data[index].text
                                # ids = str(uuid.uuid1())
                        print('{"_id":', '"' + namespace.replace("-", "") + '"' + ',' +
                              '"_openid":', '"' + 'oy2gw5etftuMXk-iXS026X3orNcc' + '"' + ',' +
                              '"name":', '"' + name_ + '"' + ',' +
                              '"pinyin":', '"' + pinyin_ + '"' + ',' +
                              '"description":', '"' + description_ + '"' + ',' +
                              '"fromWhere":', '"' + fromWhere_ + '"}')
                        str_ = '{"_id":' + '"' + namespace.replace("-", "") + '"' + ',' + \
                               '"_openid":' + '"' + 'oy2gw5etftuMXk-iXS026X3orNcc' + '"' + ',' + \
                               '"name":' + '"' + name_ + '"' + ',' + \
                               '"pinyin":' + '"' + pinyin_ + '"' + ',' + \
                               '"description":' + '"' + description_ + '"' + ',' + \
                               '"fromWhere":' + '"' + fromWhere_ + '"}'
                        file_data.write(str_)
                        file_data.write('\n')
                        # query = "insert into idiom (id,name,pinyin,description,fromWhere,story,similar,antonym,simplicity)
                        # values ('" + ids + "','" + name_ + "','" + pinyin_ + "','" + description_ + "','"
                        # + fromWhere_ + "','""','" + similar_ + "','" + antonym_ + "','""');"
                        # print(query)
                        # conn.execute(query)
                        # conn.commit()
finally:
    file_data.close()

以上是源代码，之前等它爬数据，但是中途一直会卡死，爬了几次发现每次到了某一个成语后面就卡死了，后来去网站上看这个成语，点击去是个防钓鱼页面，也就没有了成语格式了，所以不适用以上的爬虫格式，http://chengyu.t086.com/cy10/10242.html 这个页面无法访问，这里是把那个成语过滤掉了。

Royal niver give up

关注

0
点赞
踩
3

收藏

觉得还不错? 一键收藏
0
评论
python爬取成语网站数据

# encoding=utf-8import requestsfrom bs4 import BeautifulSoupimport sqlite3import uuidconn = sqlite3.connect("idiomBase.db3") # 创建sqlite.db数据库print("open database success")conn.execute("drop table IF EXISTS idiom")# id# 成语名称# 拼音# 成语解释# 成语出处#
复制链接

扫一扫