【python爬虫】爬取Bing词典的单词存到SQLite数据库

爬取Bing词典的单词

打算做一个单词相关的app自己用,那词典从何而来呢?
想到了用爬虫。爬哪里的数据呢?
个人比较喜欢微软的东西,所以打算从Bing翻译爬取单词

Bug

由于Bing翻译的html还是比较复杂的,不可避免地,可能会遇到bug
但只要是一般的单词,应该是没有什么问题的,欢迎大家帮忙测试。
代码目前还很糙,也欢迎尝试代码的朋友们提出意见。
代码里的单词是从我本机SQLite数据库里取出来的,朋友们可以根据自己需要用其他方法获取单词,得到的结果也可以存到SQLite数据库或者是文档型数据库之类的(可能文档型数据库,比如MongoDB,比较适合这种单词的存储)

单词格式

Bing词典单词页面大概分了几个模块

音标phonetics

音标有美式usa和英式uk

翻译translations

翻译分词性和翻译

时态tenses

在这里插入图片描述
时态分 时态类型和单词
比如 过去分词:acquainted

近义词

词性、单词……
在这里插入图片描述

反义词

词性、单词……
在这里插入图片描述

Advanced E-C

在这里插入图片描述

E-C

在这里插入图片描述

例句Sample examples

在这里插入图片描述

源码

源码运行需要有一个SQLite型数据库test.db,里边有一个表dictionary
点此查看test.db和dictionary获取方法
代码运行过程会在test.db中创建其他的表,比如phonetics音标表,translations翻译表……具体什么结构朋友们运行自己看吧
说到SQLite数据库,我推荐大家用SQLite Expert Professional这个可视化界面,当然也可以用navicat

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Time    : 2020/6/13 10:33
# @Author  : GiggleFan
# @FileName: collectWordsFromBing.py
# @Software: PyCharm
# @Blog    :https://blog.csdn.net/qq_40313347

import sqlite3

from fake_useragent import UserAgent
from lxml import etree
from urllib import request

def get_translations(html_object):
    """Parse the part-of-speech / translation pairs from the "qdef" block.

    The spans alternate: a part-of-speech span, then a span whose inner
    ``<span>`` holds the translation text.

    :param html_object: lxml HTML root of a Bing dictionary page
    :return: list of Translation objects
    """
    result_translations = []
    spans = html_object.xpath('//div[@class="qdef"]/ul/li/span')
    # Walk the spans pairwise; zip drops a trailing unpaired span instead of
    # raising IndexError as the original "i + 1" access would.
    for pos_span, trans_span in zip(spans[0::2], spans[1::2]):
        inner = trans_span.xpath('span')
        # Guard against a missing inner span (original indexed [0] blindly).
        trans_text = inner[0].text if inner else None
        result_translations.append(Translation(pos_span.text, trans_text))
    return result_translations
def get_phonetics(html_object):
    """Return ``[us_phonetic, uk_phonetic]`` for the page.

    Each entry is the last matching text node, or the empty string when the
    page has no phonetic block for that variant.

    :param html_object: lxml HTML root of a Bing dictionary page
    """
    us_nodes = html_object.xpath('//div[@class="hd_prUS b_primtxt"]/text()')
    uk_nodes = html_object.xpath('//div[@class="hd_pr b_primtxt"]/text()')
    # Keep only the final candidate, mirroring the "last assignment wins"
    # behaviour of iterating the node list.
    us_text = us_nodes[-1] if us_nodes else ""
    uk_text = uk_nodes[-1] if uk_nodes else ""
    return [us_text, uk_text]
def get_tenses(html_object):
    """Parse the tense section: e.g. past participle -> "acquainted".

    Labels live in ``<span>`` tags and the inflected forms in sibling
    ``<a>`` tags of each "hd_if" block.

    :param html_object: lxml HTML root of a Bing dictionary page
    :return: list of [tense_label, tense_word] pairs
    """
    result_tense = []
    for block in html_object.xpath('//div[@class="hd_div1"]/div[@class="hd_if"]'):
        # zip guards against a span/a count mismatch; the original indexed
        # a[j] blindly and could raise IndexError on malformed markup.
        for label, link in zip(block.xpath('span'), block.xpath('a')):
            result_tense.append([label.text, link.text])
    return result_tense
def get_Colls(html_object):
    """Parse the collocation ("colid") section.

    The child divs alternate: a part-of-speech div, then a div whose
    ``<a><span>`` children are the collocated words.

    :param html_object: lxml HTML root of a Bing dictionary page
    :return: list of [part_of_speech, 'w1; w2; ...'] pairs
    """
    result_coll = []
    divs = html_object.xpath('//div[@id="colid"]/div[@class="df_div2"]/div')
    # zip pairs the alternating divs and silently drops a trailing unpaired
    # div; the original "i + 1" access would raise IndexError there.
    for pos_div, words_div in zip(divs[0::2], divs[1::2]):
        words = [span.text for span in words_div.xpath('a/span')]
        result_coll.append([pos_div.text, '; '.join(words)])
    return result_coll
def get_synonym(html_object):
    """Parse the synonym ("synoid") section.

    The child divs alternate: a part-of-speech div, then a div whose
    ``<a><span>`` children are the synonym words.

    :param html_object: lxml HTML root of a Bing dictionary page
    :return: list of [part_of_speech, 'w1; w2; ...'] pairs
    """
    result_synonym = []
    divs = html_object.xpath('//div[@id="synoid"]/div[@class="df_div2"]/div')
    # zip pairs the alternating divs and silently drops a trailing unpaired
    # div; the original "i + 1" access would raise IndexError there.
    for pos_div, words_div in zip(divs[0::2], divs[1::2]):
        words = [span.text for span in words_div.xpath('a/span')]
        result_synonym.append([pos_div.text, '; '.join(words)])
    return result_synonym
def get_antonym(html_object):
    """Parse the antonym ("antoid") section.

    The child divs alternate: a part-of-speech div, then a div whose
    ``<a><span>`` children are the antonym words.

    :param html_object: lxml HTML root of a Bing dictionary page
    :return: list of [part_of_speech, 'w1; w2; ...'] pairs
    """
    result_antonym = []
    divs = html_object.xpath('//div[@id="antoid"]/div[@class="df_div2"]/div')
    # zip pairs the alternating divs and silently drops a trailing unpaired
    # div; the original "i + 1" access would raise IndexError there.
    for pos_div, words_div in zip(divs[0::2], divs[1::2]):
        words = [span.text for span in words_div.xpath('a/span')]
        result_antonym.append([pos_div.text, '; '.join(words)])
    return result_antonym
def get_ec(html_object):
    """Parse the E-C (cross-dictionary) section.

    Pairs each part-of-speech node with its definition block and joins the
    translation strings with '|'.

    :param html_object: lxml HTML root of a Bing dictionary page
    :return: list of [part_of_speech, 'a|b|...'] pairs
    """
    pos_nodes = html_object.xpath('//div[@id="crossid"]//div[@class="pos pos1"]')
    def_nodes = html_object.xpath('//div[@id="crossid"]//div[@class="def_fl"]')
    result_ec = []
    # zip stops at the shorter list; the original indexed cixing[i] and would
    # raise IndexError when the two node lists differ in length.
    for pos_node, def_node in zip(pos_nodes, def_nodes):
        translations = def_node.xpath('div/div[2]/span/text()')
        result_ec.append([pos_node.text, '|'.join(translations)])
    return result_ec
def get_examples(html_object):
    """Parse the sample-sentence section into English/Chinese pairs.

    When the English and Chinese sentence counts disagree, prints a
    diagnostic and returns an empty list.

    :param html_object: lxml HTML root of a Bing dictionary page
    :return: list of [english_sentence, chinese_sentence] pairs
    """
    result_examples = []
    en_nodes = html_object.xpath('//div[@class="sen_en b_regtxt"]')
    cn_nodes = html_object.xpath('//div[@class="sen_cn b_regtxt"]')
    # Guard clause: only emit pairs when both sides line up one-to-one.
    if len(en_nodes) != len(cn_nodes):
        print("例句原文和翻译个数不对等。")
        return result_examples
    for en_node, cn_node in zip(en_nodes, cn_nodes):
        # Each sentence is split across child nodes; join the text fragments.
        english = ''.join(en_node.xpath('*/text()'))
        chinese = ''.join(cn_node.xpath('*/text()'))
        result_examples.append([english, chinese])
    return result_examples

def get_advanced_ec2(html_object):
    """Parse the Advanced E-C entries as flat joined strings.

    :param html_object: lxml HTML root of a Bing dictionary page
    :return: list of strings, one per "def_pa" entry
    """
    entries = html_object.xpath('//div[@class="se_lis"]//div[@class="def_pa"]')
    # Each entry's text is scattered across descendant nodes; concatenate it.
    return [''.join(entry.xpath('*//text()')) for entry in entries]

def get_advanced_ec(html_object):
    """Parse Advanced E-C example pairs (legacy variant, currently unused
    by main() which calls get_advanced_ec2 instead).

    :param html_object: lxml HTML root of a Bing dictionary page
    :return: list of [english_example, chinese_example] pairs
    """
    en_nodes = html_object.xpath('//div[@class="li_pos"]//div[@class="val_ex"]')
    cn_nodes = html_object.xpath('//div[@class="li_pos"]//div[@class="bil_ex"]')
    # zip stops at the shorter list; the original indexed get_liju_cn[i] and
    # would raise IndexError when the two node lists differ in length.
    return [[en.text, cn.text] for en, cn in zip(en_nodes, cn_nodes)]
def create_table(db_path='test.db'):
    """Create (if missing) every result table the scraper writes to.

    :param db_path: SQLite database file; defaults to 'test.db' so existing
                    callers are unaffected.
    """
    # One DDL statement per result table; kept verbatim from the original.
    ddl_statements = (
        'create table if not exists phonetics(id  integer PRIMARY KEY autoincrement, word varchar(40), usa varchar(60), uk varchar(60))',
        'create table if not exists translations(id  integer PRIMARY KEY autoincrement, word varchar(40), partofspeech varchar(10), trans varchar(100))',
        'create table if not exists tenses(id  integer PRIMARY KEY autoincrement, word varchar(40), tensetype varchar(40), tenseword varchar(40))',
        'create table if not exists coll(id  integer PRIMARY KEY autoincrement, word varchar(40), partofspeech varchar(40), content varchar(200))',
        'create table if not exists synonym(id  integer PRIMARY KEY autoincrement, word varchar(40), partofspeech varchar(40), content varchar(200))',
        'create table if not exists antonym(id  integer PRIMARY KEY autoincrement, word varchar(40), partofspeech varchar(40), content varchar(200))',
        'create table if not exists advancedecs(id  integer PRIMARY KEY autoincrement, word varchar(40), en_cn varchar(400))',
        'create table if not exists ec(id  integer PRIMARY KEY autoincrement, word varchar(40), partofspeech varchar(40), content varchar(200))',
        'create table if not exists examples(id  integer PRIMARY KEY autoincrement, word varchar(40), en varchar(400), cn varchar(400))',
    )
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        for statement in ddl_statements:
            cursor.execute(statement)
        cursor.close()
        conn.commit()
    finally:
        # Ensure the connection is released even if a statement fails;
        # the original leaked it on any exception.
        conn.close()

class Translation:
    """One part-of-speech/translation pair scraped from the Bing page."""

    def __init__(self, partOfSpeech, trans: str):
        """Remember the part of speech and its translated text.

        :param partOfSpeech: e.g. 'n.', 'v.'
        :param trans: translation text for that part of speech
        """
        self.partOfSpeech = partOfSpeech
        self.trans = trans
class Tense:
    """One tense entry: the tense label and the inflected word form."""

    # Bug fix: the original defined `_init_` (single underscores), which is
    # just an ordinary method — Python never called it, so Tense('a', 'b')
    # raised TypeError. It must be the `__init__` dunder.
    def __init__(self, type, word):
        """:param type: tense label, e.g. '过去分词'
        :param word: the inflected form, e.g. 'acquainted'
        """
        self.type = type
        self.word = word
class Phonetics:
    """US and UK phonetic transcriptions of a word."""

    # Bug fix: the original defined `_init_` (single underscores), which is
    # just an ordinary method — Python never called it, so Phonetics(a, b)
    # raised TypeError. It must be the `__init__` dunder.
    def __init__(self, usa, uk):
        """:param usa: American phonetic text
        :param uk: British phonetic text
        """
        self.usa = usa
        self.uk = uk
def main():
    """Scrape the Bing dictionary page of every word in the local
    ``dictionary`` table and persist the parsed sections into test.db.

    All inserts use parameterized queries: the original built SQL by string
    concatenation (injection-prone) and escaped quotes via a '#' round-trip
    that also corrupted legitimate '#' characters in the scraped text.
    Network errors are not caught, matching the original behaviour.
    """
    # Read the word list up front so the read connection is closed before
    # the per-word write connections are opened.
    conn = sqlite3.connect('test.db')
    cursor = conn.cursor()
    cursor.execute('select word from dictionary')
    words = [row[0] for row in cursor]
    cursor.close()
    conn.close()

    # One UserAgent instance is enough; a fresh random header is still
    # picked per request below.
    ua = UserAgent()
    for word in words:
        word = word.rstrip()
        print('**********************************************\n' + word)
        headers = {
            'User-Agent': ua.random
        }
        # Quote the word so spaces/special characters form a valid URL.
        url = 'http://cn.bing.com/dict/search?q=' + request.quote(word)
        req = request.Request(url, None, headers)
        with request.urlopen(req) as uf:
            data = uf.read().decode('utf-8', 'ignore')
        if not data:
            continue

        html_object = etree.HTML(data)                  # parse for xpath lookups
        last_html_data = etree.tostring(html_object)    # let lxml complete the markup
        html_object = etree.HTML(last_html_data)        # re-parse the normalised HTML

        conn = sqlite3.connect('test.db')
        cursor = conn.cursor()
        # -------------------------------phonetics-----------------------------
        phonetics_result = get_phonetics(html_object)
        print(phonetics_result)
        cursor.execute(
            'insert into phonetics (word, usa, uk) values (?, ?, ?)',
            (word, phonetics_result[0], phonetics_result[1]))
        print("------------------------")
        # -------------------------------translations--------------------------
        for trans in get_translations(html_object):
            print(trans.partOfSpeech, trans.trans)
            # str() guards against the occasional None text node (seen with
            # e.g. "knife"), as the original code noted.
            cursor.execute(
                'insert into translations (word, partofspeech, trans) values (?, ?, ?)',
                (word, trans.partOfSpeech, str(trans.trans)))
        print("------------------------")
        # -------------------------------tenses--------------------------------
        for tense in get_tenses(html_object):
            print(tense)
            cursor.execute(
                'insert into tenses (word, tensetype, tenseword) values (?, ?, ?)',
                (word, tense[0], tense[1]))
        print("------------------------")
        # -------------------------------collocations--------------------------
        print("Coll.")
        for co in get_Colls(html_object):
            print(co)
            cursor.execute(
                'insert into coll (word, partofspeech, content) values (?, ?, ?)',
                (word, co[0], co[1]))
        print("------------------------")
        # -------------------------------synonyms------------------------------
        print("Synonym.")
        for sy in get_synonym(html_object):
            print(sy)
            cursor.execute(
                'insert into synonym (word, partofspeech, content) values (?, ?, ?)',
                (word, sy[0], sy[1]))
        print("------------------------")
        # -------------------------------antonyms------------------------------
        print("Antonym.")
        for ant in get_antonym(html_object):
            print(ant)
            cursor.execute(
                'insert into antonym (word, partofspeech, content) values (?, ?, ?)',
                (word, ant[0], ant[1]))
        print("------------------------")
        # -------------------------------Advanced E-C--------------------------
        print("AdvancedEC")
        for adec in get_advanced_ec2(html_object):
            print(adec)
            cursor.execute(
                'insert into advancedecs (word, en_cn) values (?, ?)',
                (word, adec))
        print("------------------------")
        # -------------------------------E-C------------------------------------
        print("E-C")
        for ee in get_ec(html_object):
            print(ee)
            cursor.execute(
                'insert into ec (word, partofspeech, content) values (?, ?, ?)',
                (word, ee[0], ee[1]))
        print("------------------------")
        # -------------------------------Sample Examples-----------------------
        print("例句")
        for exam in get_examples(html_object):
            print(exam)
            cursor.execute(
                'insert into examples (word, en, cn) values (?, ?, ?)',
                (word, exam[0], exam[1]))
        cursor.close()
        conn.commit()
        conn.close()

# Script entry point: ensure the result tables exist, then scrape every word.
if __name__ == '__main__':
    create_table()
    main()

再次说明,我的单词是从本地数据库SQLite中取得,以前还从文件中取过。

单词怎么来的?

从文章中取出来的。
我还有一个python代码,功能是读取txt文件取出其中的所有单词并去重,存到文件或SQLite数据库中。最初是下载了好多英文小说,但是取出的单词有些仅仅是字母的序列,没有任何意义。后来又在网上找到了常用的七千多个单词的txt和十万个单词的txt,从其中取出的单词一般就没有问题了。

单词提取代码

点此查看另一篇博客

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值