爬的单词信息
# -*- coding: utf-8 -*-
import openpyxl
import pymysql
import json
import sys
from urllib.parse import urlparse, quote, urlencode, unquote
from urllib.request import urlopen
import re
import urllib.request
from lxml import etree
import re
import time
import requests
from functools import reduce
db = pymysql.connect("localhost", "root", "123456", "STUDY", charset="utf8")
cursor = db.cursor()
#形参为单词、单词类型(PET/Unlock3/PET.....)
class Phrase():
def __init__(self,word,word_type_id):
self.word = word
self.word_type_id = word_type_id
if self.word_type_id == 2:
self.word_type = 'Unlock3'
elif self.word_type_id == 1:
self.word_type = 'PET'
elif self.word_type_id == 3:
self.word_type = '中考短语'
elif self.word_type_id == 4:
self.word_type = 'FCE'
#插入英语单词信息表t_english_info
def load_word_mysql(self):
sql_del = "delete from t_english_info where word = '%s' and word_type_id =%s"%(self.word,self.word_type_id)
cursor.execute(sql_del)
db.commit()
sql ="select max(w_id) from t_english_info where word_type_id = %s "%(self.word_type_id)
cursor.execute(sql)
db.commit()
data_max_wid = cursor.fetchall()
self.word_id = int(data_max_wid[0][0]) + 1
sql_insert_word = "insert into t_english_info(w_id,word,word_type_id,word_type) values(%s,'%s',%s,'%s')" % (self.word_id,self.word,self.word_type_id,self.word_type)
cursor.execute(sql_insert_word)
db.commit()
#获取中文释义并更新到数据库中
def get_word_cn(self):
self.query = {'q': "".join(self.word)} # list --> str: "".join(list)
self.url = 'https://fanyi.youdao.com/openapi.do?keyfrom=11pegasus11&key=273646050&type=data&doctype=json&version=1.1&' + urlencode(self.query)
self.response = urlopen(self.url, timeout=3)
self.html = self.response.read().decode('utf-8')
self.d = json.loads(self.html)
if self.d.get('errorCode') == 0:
if self.d.get('basic'):
self.explains = self.d.get('basic').get('explains')
self.result_cn = str(self.explains).replace('\'', "").replace('[', "").replace(']', "")
else:
self.result_cn = "未查到释义"
else:
self.result_cn = '未查到释义'
sql_update_cn = "update t_english_info set word_c = '%s' where word = '%s' and word_type_id = %s "%(self.result_cn,self.word ,self.word_type_id)
cursor.execute(sql_update_cn)
db.commit()
#获取音标和读音
def get_symbol_voice(self):
self.searchurl = 'http://cn.bing.com/dict/search?q='+self.word.replace(' ', '%20')
self.response = urllib.request.urlopen(self.searchurl)
self.html = self.response.read()
self.get_symbol_list = []
self.symbol_xpath = '/html/body/div[1]/div/div/div[1]/div[1]/div[1]/div[2]/div'
self.reobj1 = re.compile("(https\:.*?mp3)", re.I | re.M | re.S)
self.selector = etree.HTML(self.html.decode('utf-8'))
self.get_symbol_tmp = self.selector.xpath(self.symbol_xpath)
for item in self.get_symbol_tmp:
it = item.xpath('div')
if len(it) > 0:
ddd = self.reobj1.findall(it[1].xpath('a')[0].get('onmouseover', None))
self.get_symbol_list.append(it[0].text)
self.get_symbol_list.append(ddd[0])
ddd = self.reobj1.findall(it[3].xpath('a')[0].get('onmouseover', None))
self.get_symbol_list.append(it[2].text)
self.get_symbol_list.append(ddd[0])
if len(self.get_symbol_list) > 0:
sql_update_symbol = "update t_english_info \
set America_sign = '%s', \
America_voice = '%s', \
English_sign = '%s', \
English_voice = '%s' \
where word = '%s' \
and word_type_id =%s " % (pymysql.escape_string(self.get_symbol_list[0]),pymysql.escape_string(self.get_symbol_list[1]), \
pymysql.escape_string(self.get_symbol_list[2]),pymysql.escape_string(self.get_symbol_list[3]),self.word, self.word_type_id)
cursor.execute(sql_update_symbol)
db.commit()
else:
sql_update_symbol = "update t_english_info \
set America_sign = ' ', \
America_voice = ' ', \
English_sign = ' ', \
English_voice = ' ' \
where word = '%s' \
and word_type_id =%s " % (self.word, self.word_type_id)
cursor.execute(sql_update_symbol)
db.commit()
# 获取释义
self.examples = []
self.get_example_en = self.selector.xpath('//*[@class="val_ex"]')
self.get_example_cn = self.selector.xpath('//*[@class="bil_ex"]')
get_len = len(self.get_example_en)
for i in range(get_len):
self.examples.append("%s||%s" % (self.get_example_en[i].text, self.get_example_cn[i].text))
if len(self.examples) > 0:
self.var_examples = reduce(lambda x, y: "%s||||%s" % (x, y), self.examples).replace('||||',
'\n').replace('||',
' ')
sql_update_examples = "update t_english_info set word_example = '%s' where word = '%s' and word_type_id = %s " % (
pymysql.escape_string(self.var_examples), self.word, self.word_type_id)
cursor.execute(sql_update_examples)
db.commit()
else:
sql_update_examples = "update t_english_info set word_example = ' ' where word = '%s' and word_type_id = %s " % (
self.word, self.word_type_id)
cursor.execute(sql_update_examples)
db.commit()
#下载美音MP3
if len(self.get_symbol_list)>0:
self.req_america = requests.get(pymysql.escape_string(self.get_symbol_list[1]))
if self.req_america.status_code == 404:
return
self.filename = '0' +str(self.word_type_id)+ 'America' + pymysql.escape_string(self.get_symbol_list[1]).split('/')[-1]
self.down_load_dir = 'F:/English_Word_MP3/' + self.filename
time.sleep(0.3)
with open(self.down_load_dir, 'wb') as fobj:
fobj.write(self.req_america.content)
else:
return
#下载英音MP3
if len(self.get_symbol_list[3])>0:
self.req_english = requests.get(pymysql.escape_string(self.get_symbol_list[3]))
if self.req_english.status_code == 404:
return
self.filename = '0' + str(self.word_type_id) + 'English' + \
pymysql.escape_string(self.get_symbol_list[3]).split('/')[-1]
self.down_load_dir = 'F:/English_Word_MP3/' + self.filename
time.sleep(0.3)
with open(self.down_load_dir, 'wb') as fobj:
fobj.write(self.req_english.content)
else:
return