#!/usr/bin/env python
# -*- coding: utf-8 -*-
#----------------------------------------------------------------
# 1.[文件] WebDict.py ~ 4KB
#----------------------------------------------------------------
# Author : Scharfsinnig
#
# E-Mail : scharfsinnig@163.com
#
# File : WebDict.py
#
# Introduction:
# 脚本主要是对有道网的单词查询网页,进行简单的信息提取任务,找到
# 单词的翻译结果。方便自己在终端下对不认识的单词进行查询。
#----------------------------------------------------------------
import re
import time
import thread
import urllib
import urllib2
from urllib import quote
class Spider_Youdao:
#初始化
def __init__(self):
# Prepare the two compiled patterns that ExtractPage reads
# (Trans_Youdao_Tag, Trans_Shiji_Tag) and the loop flag that Start checks.
# NOTE(review): the first pattern literal below is split across two lines and
# the second one is only r'\s?.*?'; presumably the HTML tag text that used to
# be inside both literals was lost when this file was extracted -- TODO
# restore both patterns from the original file before relying on them.
# Pattern for the Youdao web-translation section
self.Trans_Youdao_Tag = re.compile(r'\s?
.*?\s?')#21世纪大词典段
# The trailing comment on the line above labels the next assignment:
# pattern for the "21st Century Dictionary" section.
self.Trans_Shiji_Tag = re.compile(r'\s?.*?')
# Exit flag: Start keeps looping while this is True
self.run = True
#获得查询的单词
def SearchWord(self):
    """Prompt on the terminal for the word to look up.

    Returns:
        The line typed by the user, exactly as raw_input returns it.
    """
    return raw_input("\n#[输入单词]\n>")
#得到URL
def GetUrl(self):
    """Build the Youdao lookup URL for the word returned by SearchWord.

    The word is always percent-encoded before it is placed in the query
    string. Previously the URL was only assigned when the word needed no
    escaping, so input containing a space, '&' or non-ASCII characters
    produced no usable URL and the following request failed.

    Returns:
        The search URL as a string.
    """
    word = self.SearchWord()
    # quote() leaves plain words untouched and escapes everything else, so
    # the same expression is valid for every input.
    return ("http://dict.youdao.com/search?len=eng&q="
            + quote(word)
            + "&keyfrom=dict.top")
#获得页面
def GetPage(self):
    """Download the lookup page for the URL produced by GetUrl.

    Returns:
        The response body decoded from UTF-8.

    Raises:
        urllib2.URLError: propagated from urlopen when the request fails.
        UnicodeDecodeError: if the body is not valid UTF-8.
    """
    page_url = self.GetUrl()
    # Send a browser-like User-Agent instead of urllib2's default one.
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:32.0) Gecko/20100101 Firefox/32.0',
    }
    request = urllib2.Request(page_url, headers=request_headers)
    response = urllib2.urlopen(request)
    try:
        # Decode the raw bytes so callers receive text.
        return response.read().decode("utf-8")
    finally:
        # Always release the connection, even when read/decode raises.
        response.close()
#开始提取网页中的信息
def ExtractPage(self):
# Fetch the result page via GetPage, then print the matches of the two
# patterns stored on the instance, with tag-like spans removed, under a
# "Youdao" heading and a "21st Century Dictionary" heading.
# NOTE(review): indentation was lost in this copy, so the loop nesting
# described in the comments below is presumed, not shown -- TODO confirm.
# Fetch the page
MyPage = self.GetPage()
# Pattern for the basic Youdao translation
# (this name, with a lowercase "d", is never read again in this method)
YoudaoTrans = self.Trans_Youdao_Tag
# Pattern for the 21st Century Dictionary translation
ShijiTrans = self.Trans_Shiji_Tag
print "--------------------------------------------"
# Same pattern again under the spelling that the code below actually uses
YouDaoTrans = self.Trans_Youdao_Tag
# This iterator is overwritten in the item loop below before it is consumed
TransYdIterator = YouDaoTrans.finditer(MyPage)
print "#(翻译来自有道词典):"
# NOTE(review): the findall call below is cut off in the middle of its pattern
# literal; the rest of the pattern and the remaining arguments are missing,
# presumably lost together with the HTML tag text during extraction -- TODO
# restore from the original file.
myItems = re.findall('
(.*?)for item in myItems:
YDTmp = item
# Search each extracted item (not the whole page) with the Youdao pattern
TransYdIterator = YouDaoTrans.finditer(YDTmp)
for iterator in TransYdIterator:
YouDao = iterator.group()
# Remove every "<...>" span, plus one optional whitespace character before
# it, from the matched text before printing
YDTag = re.compile('\s?<.*?>')
print YDTag.sub('',YouDao)
print "--------------------------------------------"
# The 21st Century pattern is applied to the whole page
TransSjIterator = ShijiTrans.finditer(MyPage)
print "#(翻译来自21世纪大词典):"
for iterator in TransSjIterator:
ShiJi = iterator.group()
# Same tag-stripping substitution as for the Youdao matches
SJTag = re.compile('\s?<.*?>')
print SJTag.sub('',ShiJi)
print "--------------------------------------------"
#启动爬虫
def Start(self):
    """Run the interactive lookup loop until the user asks to quit.

    Each round first asks whether to continue. Entering "!" clears
    self.run, which ends the loop; any other answer (including just
    pressing Enter) calls ExtractPage for one lookup.
    """
    while self.run:
        answer = raw_input("\n#[\"!\"号退出.回车继续.]\n>")
        if answer == "!":
            # Clearing the flag makes the loop condition fail on the next check.
            self.run = False
            continue
        self.ExtractPage()
if __name__ == '__main__':
    # Start the interactive lookup loop only when run as a script.
    Spider_Youdao().Start()