# encoding=utf-8
import requests
from bs4 import BeautifulSoup
import sqlite3
import uuid
conn = sqlite3.connect("idiomBase.db3")  # create/open the SQLite database file
print("Opened database successfully")
conn.execute("drop table IF EXISTS idiom")
# id
# idiom name
# pinyin
# definition
# source (where the idiom comes from)
# story
# synonyms
# antonyms
# abbreviated pinyin (initial letters)
query = """create table IF NOT EXISTS idiom(
id VARCHAR(50),
name VARCHAR(50),
pinyin VARCHAR(50),
description VARCHAR(500),
fromWhere VARCHAR(500),
story VARCHAR(500),
similar VARCHAR(500),
antonym VARCHAR(500),
simplicity VARCHAR(500)
);"""
conn.execute(query)
print("Table created successfully")
all_url = 'http://chengyu.t086.com'
# all_url = 'http://xh.5156edu.com'
# HTTP request headers
HostReferer = {
'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
'Referer': 'http://chengyu.t086.com/'
}
# the data under 'W' is problematic on this site
word = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
'W', 'X', 'Y', 'Z']
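# a shorter equivalent, if preferred (sketch; assumes "import string" at the top):
# word = list(string.ascii_uppercase)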
# href="/cy0/102.html"
# write to F:\\data.json; the file is created if it does not exist, overwritten if it does
try:
    with open('F:\\data.json', 'w+') as file_data:
        for w in word:
            for n in range(1, 100):
                url = all_url + '/list/' + w + '_' + str(n) + '.html'
                # /cy10/10242.html is an anti-phishing block page, so that idiom has to be filtered out
                start_html = requests.get(url, headers=HostReferer)
                if start_html.status_code == 404:
                    break
                start_html.encoding = 'gbk'
                soup = BeautifulSoup(start_html.text, "html.parser")
                listw = soup.find('div', class_='listw')
                lista = listw.find_all('a')
                # walk every <a> tag and build each idiom's detail-page URL from its href
                for item in lista:
                    # skip the blocked phishing page, then build the detail-page address
                    if item.get("href") != "/cy10/10242.html":
                        detail_url = all_url + item.get("href")
                        second_html = requests.get(detail_url, headers=HostReferer)
                        if second_html.status_code == 404:
                            break
                        second_html.encoding = 'gbk'
                        soup = BeautifulSoup(second_html.text, "html.parser")
                        detail_data = soup.find('div', class_='mainbar')
                        # the <td> cells inside this div hold the name, pinyin, definition and source
                        tr_data = detail_data.find_all_next('td')
                        # print(tr_data.__len__())
                        # print(tr_data)
                        # index: 1 = idiom name, 3 = pinyin, 5 = definition, 7 = source, 9 = example, 11 = synonyms, 13 = antonyms
                        # generate one id per idiom (not per cell), and default each
                        # field so a missing <td> cannot leave a variable undefined
                        namespace = str(uuid.uuid1())
                        name_ = pinyin_ = description_ = fromWhere_ = ''
                        for index in range(len(tr_data)):
                            if index > 17:
                                break
                            if index % 2 != 0:
                                # print(index, tr_data[index].text)
                                if index == 1:
                                    name_ = tr_data[index].text
                                    # print('"name":', '"' + tr_data[index].text + '"')
                                if index == 3:
                                    pinyin_ = tr_data[index].text
                                    # print('"pinyin":', '"' + tr_data[index].text + '"')
                                if index == 5:
                                    description_ = tr_data[index].text
                                    # print('"description":', '"' + tr_data[index].text + '"')
                                if index == 7:
                                    fromWhere_ = tr_data[index].text
                                    # print('"fromWhere":', '"' + tr_data[index].text + '"')
                                # if index == 11:
                                #     similar_ = tr_data[index].text
                                # if index == 13:
                                #     antonym_ = tr_data[index].text
                        # ids = str(uuid.uuid1())
print('{"_id":', '"' + namespace.replace("-", "") + '"' + ',' +
'"_openid":', '"' + 'oy2gw5etftuMXk-iXS026X3orNcc' + '"' + ',' +
'"name":', '"' + name_ + '"' + ',' +
'"pinyin":', '"' + pinyin_ + '"' + ',' +
'"description":', '"' + description_ + '"' + ',' +
'"fromWhere":', '"' + fromWhere_ + '"}')
str_ = '{"_id":' + '"' + namespace.replace("-", "") + '"' + ',' + \
'"_openid":' + '"' + 'oy2gw5etftuMXk-iXS026X3orNcc' + '"' + ',' + \
'"name":' + '"' + name_ + '"' + ',' + \
'"pinyin":' + '"' + pinyin_ + '"' + ',' + \
'"description":' + '"' + description_ + '"' + ',' + \
'"fromWhere":' + '"' + fromWhere_ + '"}'
file_data.write(str_)
file_data.write('\n')
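                        # A safer sketch (assumption: "import json" added at the top):
                        # json.dumps escapes quotes and backslashes in the scraped
                        # text, so the output stays valid JSON.
                        # record = {"_id": namespace.replace("-", ""),
                        #           "_openid": "oy2gw5etftuMXk-iXS026X3orNcc",
                        #           "name": name_, "pinyin": pinyin_,
                        #           "description": description_, "fromWhere": fromWhere_}
                        # file_data.write(json.dumps(record, ensure_ascii=False) + '\n')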
# query = "insert into idiom (id,name,pinyin,description,fromWhere,story,similar,antonym,simplicity)
# values ('" + ids + "','" + name_ + "','" + pinyin_ + "','" + description_ + "','"
# + fromWhere_ + "','""','" + similar_ + "','" + antonym_ + "','""');"
# print(query)
# conn.execute(query)
# conn.commit()
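                        # A parameterized alternative (sketch; empty strings stand in
                        # for the fields this page does not provide), which avoids the
                        # quoting problems of the concatenated statement above:
                        # conn.execute(
                        #     "insert into idiom (id,name,pinyin,description,fromWhere,story,similar,antonym,simplicity)"
                        #     " values (?,?,?,?,?,?,?,?,?)",
                        #     (namespace, name_, pinyin_, description_, fromWhere_, '', '', '', ''))
                        # conn.commit()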
finally:
    # the 'with' block already closes file_data; close the database connection instead
    conn.close()
Above is the source code. I previously left it running to crawl the data, but it kept freezing partway through; after several runs I found it always stalled once it reached one particular idiom. Checking the site showed that this idiom's detail page is an anti-phishing block page, so it has none of the usual idiom markup and the scraping logic above does not apply to it. The page http://chengyu.t086.com/cy10/10242.html cannot be accessed, which is why that idiom is filtered out here.
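Since a stalled page shows up as a request that never returns rather than a clean 404, a request timeout plus an exception guard is a more general fix than filtering one hard-coded URL. A minimal sketch (the fetch helper and the 10-second timeout are assumptions, not part of the original script):

import requests

def fetch(url, headers, timeout=10):
    # return the response, or None when the page hangs, errors out, or is not 200
    try:
        resp = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        return None
    if resp.status_code != 200:
        return None
    return resp

Each requests.get call in the loops above could then become a fetch call whose None result is skipped, so a single blocked idiom page can no longer stall the whole crawl.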