爬单词

爬的单词信息

# -*- coding: utf-8 -*-
import openpyxl
import pymysql
import json
import sys
from urllib.parse import urlparse, quote, urlencode, unquote
from urllib.request import urlopen
import re
import urllib.request
from lxml import etree
import re
import time
import requests
from functools import reduce

db = pymysql.connect("localhost", "root", "123456", "STUDY", charset="utf8")
cursor = db.cursor()

#形参为单词、单词类型(PET/Unlock3/PET.....)
class Phrase():
    def __init__(self,word,word_type_id):
        self.word = word
        self.word_type_id = word_type_id

        if self.word_type_id == 2:
            self.word_type = 'Unlock3'
        elif self.word_type_id == 1:
            self.word_type = 'PET'
        elif self.word_type_id == 3:
            self.word_type = '中考短语'
        elif self.word_type_id == 4:
            self.word_type = 'FCE'


    #插入英语单词信息表t_english_info
    def load_word_mysql(self):
        sql_del = "delete from t_english_info where word = '%s' and word_type_id =%s"%(self.word,self.word_type_id)
        cursor.execute(sql_del)
        db.commit()

        sql ="select max(w_id) from t_english_info where word_type_id = %s "%(self.word_type_id)
        cursor.execute(sql)
        db.commit()
        data_max_wid = cursor.fetchall()
        self.word_id = int(data_max_wid[0][0]) + 1

        sql_insert_word = "insert into t_english_info(w_id,word,word_type_id,word_type) values(%s,'%s',%s,'%s')" % (self.word_id,self.word,self.word_type_id,self.word_type)
        cursor.execute(sql_insert_word)
        db.commit()

    #获取中文释义并更新到数据库中
    def get_word_cn(self):
        self.query = {'q': "".join(self.word)}  # list --> str: "".join(list)
        self.url = 'https://fanyi.youdao.com/openapi.do?keyfrom=11pegasus11&key=273646050&type=data&doctype=json&version=1.1&' + urlencode(self.query)
        self.response = urlopen(self.url, timeout=3)
        self.html = self.response.read().decode('utf-8')

        self.d = json.loads(self.html)
        if self.d.get('errorCode') == 0:
            if self.d.get('basic'):
                self.explains = self.d.get('basic').get('explains')
                self.result_cn = str(self.explains).replace('\'', "").replace('[', "").replace(']', "")
            else:
                self.result_cn = "未查到释义"
        else:
            self.result_cn = '未查到释义'

        sql_update_cn = "update t_english_info set word_c = '%s' where word = '%s' and word_type_id = %s "%(self.result_cn,self.word ,self.word_type_id)
        cursor.execute(sql_update_cn)
        db.commit()

    #获取音标和读音
    def get_symbol_voice(self):

        self.searchurl = 'http://cn.bing.com/dict/search?q='+self.word.replace(' ', '%20')
        self.response = urllib.request.urlopen(self.searchurl)
        self.html = self.response.read()

        self.get_symbol_list = []
        self.symbol_xpath = '/html/body/div[1]/div/div/div[1]/div[1]/div[1]/div[2]/div'
        self.reobj1 = re.compile("(https\:.*?mp3)", re.I | re.M | re.S)

        self.selector = etree.HTML(self.html.decode('utf-8'))

        self.get_symbol_tmp = self.selector.xpath(self.symbol_xpath)
        for item in self.get_symbol_tmp:
            it = item.xpath('div')
            if len(it) > 0:
                ddd = self.reobj1.findall(it[1].xpath('a')[0].get('onmouseover', None))
                self.get_symbol_list.append(it[0].text)
                self.get_symbol_list.append(ddd[0])
                ddd = self.reobj1.findall(it[3].xpath('a')[0].get('onmouseover', None))
                self.get_symbol_list.append(it[2].text)
                self.get_symbol_list.append(ddd[0])
        if len(self.get_symbol_list) > 0:

            sql_update_symbol = "update t_english_info \
                                set America_sign = '%s', \
                                    America_voice = '%s', \
                                    English_sign = '%s', \
                                    English_voice = '%s' \
                                where word = '%s'  \
                                and word_type_id =%s " % (pymysql.escape_string(self.get_symbol_list[0]),pymysql.escape_string(self.get_symbol_list[1]), \
                                                          pymysql.escape_string(self.get_symbol_list[2]),pymysql.escape_string(self.get_symbol_list[3]),self.word, self.word_type_id)

            cursor.execute(sql_update_symbol)
            db.commit()
        else:
            sql_update_symbol = "update t_english_info \
                                            set America_sign = ' ', \
                                             America_voice = ' ', \
                                             English_sign = ' ', \
                                             English_voice = ' ' \
                                            where word = '%s'  \
                                            and word_type_id =%s " % (self.word, self.word_type_id)
            cursor.execute(sql_update_symbol)
            db.commit()

        # 获取释义
        self.examples = []
        self.get_example_en = self.selector.xpath('//*[@class="val_ex"]')
        self.get_example_cn = self.selector.xpath('//*[@class="bil_ex"]')
        get_len = len(self.get_example_en)
        for i in range(get_len):
            self.examples.append("%s||%s" % (self.get_example_en[i].text, self.get_example_cn[i].text))
        if len(self.examples) > 0:
            self.var_examples = reduce(lambda x, y: "%s||||%s" % (x, y), self.examples).replace('||||',
                                                                                                    '\n').replace('||',
                                                                                                                  ' ')

            sql_update_examples = "update t_english_info set word_example = '%s' where word = '%s' and word_type_id = %s " % (
                pymysql.escape_string(self.var_examples), self.word, self.word_type_id)
            cursor.execute(sql_update_examples)
            db.commit()
        else:
            sql_update_examples = "update t_english_info set word_example = '  ' where word = '%s' and word_type_id = %s " % (
                self.word, self.word_type_id)
            cursor.execute(sql_update_examples)
            db.commit()


        #下载美音MP3
        if len(self.get_symbol_list)>0:
            self.req_america = requests.get(pymysql.escape_string(self.get_symbol_list[1]))
            if self.req_america.status_code == 404:
                return
            self.filename = '0' +str(self.word_type_id)+ 'America' + pymysql.escape_string(self.get_symbol_list[1]).split('/')[-1]
            self.down_load_dir = 'F:/English_Word_MP3/' + self.filename
            time.sleep(0.3)
            with open(self.down_load_dir, 'wb') as fobj:
                fobj.write(self.req_america.content)
        else:
            return


        #下载英音MP3
        if len(self.get_symbol_list[3])>0:
            self.req_english = requests.get(pymysql.escape_string(self.get_symbol_list[3]))

            if self.req_english.status_code == 404:
                return
            self.filename = '0' + str(self.word_type_id) + 'English' + \
                        pymysql.escape_string(self.get_symbol_list[3]).split('/')[-1]
            self.down_load_dir = 'F:/English_Word_MP3/' + self.filename
            time.sleep(0.3)
            with open(self.down_load_dir, 'wb') as fobj:
                fobj.write(self.req_english.content)
        else:
            return
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值