【python爬虫】爬取Bing词典的单词存到SQLite数据库

爬取Bing词典的单词

打算做一个单词相关的app自己用,那词典从何而来呢?
想到了用爬虫。爬哪里的数据呢?
个人比较喜欢微软的东西,所以打算从Bing翻译爬取单词

Bug

由于Bing翻译的html还是比较复杂的,不可避免地,可能会遇到bug
但只要是一般的单词,应该是没有什么问题的,欢迎大家帮忙测试。
代码目前还很糙,也欢迎尝试代码的朋友们提出意见。
代码里的单词是从我本机SQLite数据库里取出来的,朋友们可以根据自己需要用其他方法获取单词,得到的结果也可以存到SQLite数据库或者是文档型数据库之类的(可能文档型数据库,比如MongoDB,比较适合这种单词的存储)

单词格式

Bing词典单词页面大概分了几个模块

音标phonetics

音标有美式usa和英式uk

翻译translations

翻译分词性和翻译

时态tenses

在这里插入图片描述
时态分 时态类型和单词
比如 过去分词:acquainted

近义词

词性、单词……
在这里插入图片描述

反义词

词性、单词……
在这里插入图片描述

Advanced E-C

在这里插入图片描述

E-C

在这里插入图片描述

例句Sample examples

在这里插入图片描述

源码

源码运行需要有一个SQLite型数据库test.db,里边有一个表dictionary
点此查看test.db和dictionary获取方法
代码运行过程会在test.db中创建其他的表,比如phonetics音标表,translations翻译表……具体什么结构朋友们运行自己看吧
说到SQLite数据库,我推荐大家用SQLite Expert Professional这个可视化界面,当然也可以用navicat

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Time    : 2020/6/13 10:33
# @Author  : GiggleFan
# @FileName: collectWordsFromBing.py
# @Software: PyCharm
# @Blog    :https://blog.csdn.net/qq_40313347

import sqlite3

from fake_useragent import UserAgent
from lxml import etree
from urllib import request

def get_translations(html_object):
    """Parse the part-of-speech / translation pairs from the "qdef" block.

    The spans alternate: a part-of-speech span, then a span whose inner
    ``<span>`` holds the translation text.

    :param html_object: lxml HTML root of a Bing dictionary page
    :return: list of Translation objects
    """
    result_translations = []
    spans = html_object.xpath('//div[@class="qdef"]/ul/li/span')
    # Walk the spans pairwise; zip drops a trailing unpaired span instead of
    # raising IndexError as the original "i + 1" access would.
    for pos_span, trans_span in zip(spans[0::2], spans[1::2]):
        inner = trans_span.xpath('span')
        # Guard against a missing inner span (original indexed [0] blindly).
        trans_text = inner[0].text if inner else None
        result_translations.append(Translation(pos_span.text, trans_text))
    return result_translations
def get_phonetics(html_object):
    """Return ``[us_phonetic, uk_phonetic]`` for the page.

    Each entry is the last matching text node, or the empty string when the
    page has no phonetic block for that variant.

    :param html_object: lxml HTML root of a Bing dictionary page
    """
    us_nodes = html_object.xpath('//div[@class="hd_prUS b_primtxt"]/text()')
    uk_nodes = html_object.xpath('//div[@class="hd_pr b_primtxt"]/text()')
    # Keep only the final candidate, mirroring the "last assignment wins"
    # behaviour of iterating the node list.
    us_text = us_nodes[-1] if us_nodes else ""
    uk_text = uk_nodes[-1] if uk_nodes else ""
    return [us_text, uk_text]
def get_tenses(html_object):
    """Parse the tense section: e.g. past participle -> "acquainted".

    Labels live in ``<span>`` tags and the inflected forms in sibling
    ``<a>`` tags of each "hd_if" block.

    :param html_object: lxml HTML root of a Bing dictionary page
    :return: list of [tense_label, tense_word] pairs
    """
    result_tense = []
    for block in html_object.xpath('//div[@class="hd_div1"]/div[@class="hd_if"]'):
        # zip guards against a span/a count mismatch; the original indexed
        # a[j] blindly and could raise IndexError on malformed markup.
        for label, link in zip(block.xpath('span'), block.xpath('a')):
            result_tense.append([label.text, link.text])
    return result_tense
def get_Colls(html_object):
    """Parse the collocation ("colid") section.

    The child divs alternate: a part-of-speech div, then a div whose
    ``<a><span>`` children are the collocated words.

    :param html_object: lxml HTML root of a Bing dictionary page
    :return: list of [part_of_speech, 'w1; w2; ...'] pairs
    """
    result_coll = []
    divs = html_object.xpath('//div[@id="colid"]/div[@class="df_div2"]/div')
    # zip pairs the alternating divs and silently drops a trailing unpaired
    # div; the original "i + 1" access would raise IndexError there.
    for pos_div, words_div in zip(divs[0::2], divs[1::2]):
        words = [span.text for span in words_div.xpath('a/span')]
        result_coll.append([pos_div.text, '; '.join(words)])
    return result_coll
def get_synonym(html_object):
    """Parse the synonym ("synoid") section.

    The child divs alternate: a part-of-speech div, then a div whose
    ``<a><span>`` children are the synonym words.

    :param html_object: lxml HTML root of a Bing dictionary page
    :return: list of [part_of_speech, 'w1; w2; ...'] pairs
    """
    result_synonym = []
    divs = html_object.xpath('//div[@id="synoid"]/div[@class="df_div2"]/div')
    # zip pairs the alternating divs and silently drops a trailing unpaired
    # div; the original "i + 1" access would raise IndexError there.
    for pos_div, words_div in zip(divs[0::2], divs[1::2]):
        words = [span.text for span in words_div.xpath('a/span')]
        result_synonym.append([pos_div.text, '; '.join(words)])
    return result_synonym
def get_antonym(html_object):
    """Parse the antonym ("antoid") section.

    The child divs alternate: a part-of-speech div, then a div whose
    ``<a><span>`` children are the antonym words.

    :param html_object: lxml HTML root of a Bing dictionary page
    :return: list of [part_of_speech, 'w1; w2; ...'] pairs
    """
    result_antonym = []
    divs = html_object.xpath('//div[@id="antoid"]/div[@class="df_div2"]/div')
    # zip pairs the alternating divs and silently drops a trailing unpaired
    # div; the original "i + 1" access would raise IndexError there.
    for pos_div, words_div in zip(divs[0::2], divs[1::2]):
        words = [span.text for span in words_div.xpath('a/span')]
        result_antonym.append([pos_div.text, '; '.join(words)])
    return result_antonym
def get_ec(html_object):
    """Parse the E-C (cross-dictionary) section.

    Pairs each part-of-speech node with its definition block and joins the
    translation strings with '|'.

    :param html_object: lxml HTML root of a Bing dictionary page
    :return: list of [part_of_speech, 'a|b|...'] pairs
    """
    pos_nodes = html_object.xpath('//div[@id="crossid"]//div[@class="pos pos1"]')
    def_nodes = html_object.xpath('//div[@id="crossid"]//div[@class="def_fl"]')
    result_ec = []
    # zip stops at the shorter list; the original indexed cixing[i] and would
    # raise IndexError when the two node lists differ in length.
    for pos_node, def_node in zip(pos_nodes, def_nodes):
        translations = def_node.xpath('div/div[2]/span/text()')
        result_ec.append([pos_node.text, '|'.join(translations)])
    return result_ec
def get_examples(html_object):
    """Parse the sample-sentence section into English/Chinese pairs.

    When the English and Chinese sentence counts disagree, prints a
    diagnostic and returns an empty list.

    :param html_object: lxml HTML root of a Bing dictionary page
    :return: list of [english_sentence, chinese_sentence] pairs
    """
    result_examples = []
    en_nodes = html_object.xpath('//div[@class="sen_en b_regtxt"]')
    cn_nodes = html_object.xpath('//div[@class="sen_cn b_regtxt"]')
    # Guard clause: only emit pairs when both sides line up one-to-one.
    if len(en_nodes) != len(cn_nodes):
        print("例句原文和翻译个数不对等。")
        return result_examples
    for en_node, cn_node in zip(en_nodes, cn_nodes):
        # Each sentence is split across child nodes; join the text fragments.
        english = ''.join(en_node.xpath('*/text()'))
        chinese = ''.join(cn_node.xpath('*/text()'))
        result_examples.append([english, chinese])
    return result_examples

def get_advanced_ec2(html_object):
    """Parse the Advanced E-C entries as flat joined strings.

    :param html_object: lxml HTML root of a Bing dictionary page
    :return: list of strings, one per "def_pa" entry
    """
    entries = html_object.xpath('//div[@class="se_lis"]//div[@class="def_pa"]')
    # Each entry's text is scattered across descendant nodes; concatenate it.
    return [''.join(entry.xpath('*//text()')) for entry in entries]

def get_advanced_ec(html_object):
    """Parse Advanced E-C example pairs (legacy variant, currently unused
    by main() which calls get_advanced_ec2 instead).

    :param html_object: lxml HTML root of a Bing dictionary page
    :return: list of [english_example, chinese_example] pairs
    """
    en_nodes = html_object.xpath('//div[@class="li_pos"]//div[@class="val_ex"]')
    cn_nodes = html_object.xpath('//div[@class="li_pos"]//div[@class="bil_ex"]')
    # zip stops at the shorter list; the original indexed get_liju_cn[i] and
    # would raise IndexError when the two node lists differ in length.
    return [[en.text, cn.text] for en, cn in zip(en_nodes, cn_nodes)]
def create_table(db_path='test.db'):
    """Create (if missing) every result table the scraper writes to.

    :param db_path: SQLite database file; defaults to 'test.db' so existing
                    callers are unaffected.
    """
    # One DDL statement per result table; kept verbatim from the original.
    ddl_statements = (
        'create table if not exists phonetics(id  integer PRIMARY KEY autoincrement, word varchar(40), usa varchar(60), uk varchar(60))',
        'create table if not exists translations(id  integer PRIMARY KEY autoincrement, word varchar(40), partofspeech varchar(10), trans varchar(100))',
        'create table if not exists tenses(id  integer PRIMARY KEY autoincrement, word varchar(40), tensetype varchar(40), tenseword varchar(40))',
        'create table if not exists coll(id  integer PRIMARY KEY autoincrement, word varchar(40), partofspeech varchar(40), content varchar(200))',
        'create table if not exists synonym(id  integer PRIMARY KEY autoincrement, word varchar(40), partofspeech varchar(40), content varchar(200))',
        'create table if not exists antonym(id  integer PRIMARY KEY autoincrement, word varchar(40), partofspeech varchar(40), content varchar(200))',
        'create table if not exists advancedecs(id  integer PRIMARY KEY autoincrement, word varchar(40), en_cn varchar(400))',
        'create table if not exists ec(id  integer PRIMARY KEY autoincrement, word varchar(40), partofspeech varchar(40), content varchar(200))',
        'create table if not exists examples(id  integer PRIMARY KEY autoincrement, word varchar(40), en varchar(400), cn varchar(400))',
    )
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        for statement in ddl_statements:
            cursor.execute(statement)
        cursor.close()
        conn.commit()
    finally:
        # Ensure the connection is released even if a statement fails;
        # the original leaked it on any exception.
        conn.close()

class Translation:
    """One part-of-speech/translation pair scraped from the Bing page."""

    def __init__(self, partOfSpeech, trans: str):
        """Remember the part of speech and its translated text.

        :param partOfSpeech: e.g. 'n.', 'v.'
        :param trans: translation text for that part of speech
        """
        self.partOfSpeech = partOfSpeech
        self.trans = trans
class Tense:
    """One tense entry: the tense label and the inflected word form."""

    # Bug fix: the original defined `_init_` (single underscores), which is
    # just an ordinary method — Python never called it, so Tense('a', 'b')
    # raised TypeError. It must be the `__init__` dunder.
    def __init__(self, type, word):
        """:param type: tense label, e.g. '过去分词'
        :param word: the inflected form, e.g. 'acquainted'
        """
        self.type = type
        self.word = word
class Phonetics:
    """US and UK phonetic transcriptions of a word."""

    # Bug fix: the original defined `_init_` (single underscores), which is
    # just an ordinary method — Python never called it, so Phonetics(a, b)
    # raised TypeError. It must be the `__init__` dunder.
    def __init__(self, usa, uk):
        """:param usa: American phonetic text
        :param uk: British phonetic text
        """
        self.usa = usa
        self.uk = uk
def main():
    """Scrape the Bing dictionary page of every word in the local
    ``dictionary`` table and persist the parsed sections into test.db.

    All inserts use parameterized queries: the original built SQL by string
    concatenation (injection-prone) and escaped quotes via a '#' round-trip
    that also corrupted legitimate '#' characters in the scraped text.
    Network errors are not caught, matching the original behaviour.
    """
    # Read the word list up front so the read connection is closed before
    # the per-word write connections are opened.
    conn = sqlite3.connect('test.db')
    cursor = conn.cursor()
    cursor.execute('select word from dictionary')
    words = [row[0] for row in cursor]
    cursor.close()
    conn.close()

    # One UserAgent instance is enough; a fresh random header is still
    # picked per request below.
    ua = UserAgent()
    for word in words:
        word = word.rstrip()
        print('**********************************************\n' + word)
        headers = {
            'User-Agent': ua.random
        }
        # Quote the word so spaces/special characters form a valid URL.
        url = 'http://cn.bing.com/dict/search?q=' + request.quote(word)
        req = request.Request(url, None, headers)
        with request.urlopen(req) as uf:
            data = uf.read().decode('utf-8', 'ignore')
        if not data:
            continue

        html_object = etree.HTML(data)                  # parse for xpath lookups
        last_html_data = etree.tostring(html_object)    # let lxml complete the markup
        html_object = etree.HTML(last_html_data)        # re-parse the normalised HTML

        conn = sqlite3.connect('test.db')
        cursor = conn.cursor()
        # -------------------------------phonetics-----------------------------
        phonetics_result = get_phonetics(html_object)
        print(phonetics_result)
        cursor.execute(
            'insert into phonetics (word, usa, uk) values (?, ?, ?)',
            (word, phonetics_result[0], phonetics_result[1]))
        print("------------------------")
        # -------------------------------translations--------------------------
        for trans in get_translations(html_object):
            print(trans.partOfSpeech, trans.trans)
            # str() guards against the occasional None text node (seen with
            # e.g. "knife"), as the original code noted.
            cursor.execute(
                'insert into translations (word, partofspeech, trans) values (?, ?, ?)',
                (word, trans.partOfSpeech, str(trans.trans)))
        print("------------------------")
        # -------------------------------tenses--------------------------------
        for tense in get_tenses(html_object):
            print(tense)
            cursor.execute(
                'insert into tenses (word, tensetype, tenseword) values (?, ?, ?)',
                (word, tense[0], tense[1]))
        print("------------------------")
        # -------------------------------collocations--------------------------
        print("Coll.")
        for co in get_Colls(html_object):
            print(co)
            cursor.execute(
                'insert into coll (word, partofspeech, content) values (?, ?, ?)',
                (word, co[0], co[1]))
        print("------------------------")
        # -------------------------------synonyms------------------------------
        print("Synonym.")
        for sy in get_synonym(html_object):
            print(sy)
            cursor.execute(
                'insert into synonym (word, partofspeech, content) values (?, ?, ?)',
                (word, sy[0], sy[1]))
        print("------------------------")
        # -------------------------------antonyms------------------------------
        print("Antonym.")
        for ant in get_antonym(html_object):
            print(ant)
            cursor.execute(
                'insert into antonym (word, partofspeech, content) values (?, ?, ?)',
                (word, ant[0], ant[1]))
        print("------------------------")
        # -------------------------------Advanced E-C--------------------------
        print("AdvancedEC")
        for adec in get_advanced_ec2(html_object):
            print(adec)
            cursor.execute(
                'insert into advancedecs (word, en_cn) values (?, ?)',
                (word, adec))
        print("------------------------")
        # -------------------------------E-C------------------------------------
        print("E-C")
        for ee in get_ec(html_object):
            print(ee)
            cursor.execute(
                'insert into ec (word, partofspeech, content) values (?, ?, ?)',
                (word, ee[0], ee[1]))
        print("------------------------")
        # -------------------------------Sample Examples-----------------------
        print("例句")
        for exam in get_examples(html_object):
            print(exam)
            cursor.execute(
                'insert into examples (word, en, cn) values (?, ?, ?)',
                (word, exam[0], exam[1]))
        cursor.close()
        conn.commit()
        conn.close()

# Script entry point: ensure the result tables exist, then scrape every word.
if __name__ == '__main__':
    create_table()
    main()

再次说明,我的单词是从本地数据库SQLite中取得,以前还从文件中取过。

单词怎么来的?

从文章中取出来的。
我还有一个python代码,功能是读取txt文件取出其中的所有单词并去重,存到文件或SQLite数据库中。最初是下载了好多英文小说,但是取出的单词有些仅仅是字母的序列,没有任何意义。后来又在网上找到了常用的七千多个单词的txt和十万个单词的txt,从其中取出的单词一般就没有问题了。

单词提取代码

点此查看另一篇博客

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值