关于我为了看懂技术文档而爬英语技术文档的单词这件事-CSDN博客

本文链接：https://blog.csdn.net/HuangJiaxinZ/article/details/127816232

想法来源

之前，应该是在《大话设计数据结构》里看到作者说过（应该是这本书名，如果不是，抱歉，我没有记书名的习惯）：为了逼自己学英语，他爬取英语网站的单词，把英语网站上常用的单词按出现次数排序，出现次数越多的排得越靠前，这样就可以专注于出现频率高的单词。这确实是一个不错的想法，于是我也抽空自己实现了一下。

而我下面的实现并没有统计单词的出现频率，只是简单地爬取英语单词，然后过滤掉一些简单的单词，初步做了一个简单版本。其实主要精力还是花在过滤专业名词和小学、中学级别的简单单词上。

代码实现

文件名：英语技术文档爬虫翻译.py

import requests  
from bs4 import BeautifulSoup  
import re  
import 百度翻译api as baidu  
  
# Accumulates the translation results; the __main__ block of this script
# extends it with what baidu.queryList returns for each batch of words.
resultList=[]  
  
# Words to drop from the crawl result: simple everyday words and technical
# terms. The list is maintained by hand (the author calls it a brute-force
# approach). get_word_list_from_doc converts it to a set and subtracts it from
# the crawled words, so the duplicate entries below are harmless.
# NOTE(review): the crawler only collects lower-cased runs of ASCII letters
# with 3 < len < 20, so entries that are very short, contain upper-case
# letters, or contain non-letter characters (e.g. 'a', 'JavaConfig', '*',
# 'spring-web', '') look like they can never match -- confirm before pruning.
xxs_list=['this', 'is', 'the', 'spring', 'framework', 'of', 'part', 'that', 'are', 'to', 'these', 'container',  
          'has', 'its', 'own', 'which', 'easy', 'understand', 'and', 'in', 'java', 'programming', 'with', 'most',  
          'also', 'chapter', 'objects', 'define', 'other', 'they', 'work', 'only', 'through', 'a', 'method', 'or',  
          'set', 'instance', 'it', 'is', 'from', 'factory', 'name', 'bean', 'itself', 'its', 'using', 'classes',  
          'such', 'as', 'will', 'jvm', 'https', 'typed', 'script', 'ask', 'because', 'tag', 'int', 'long',  
          'however', 'but', 'node', 'session', 'range', 'choice', 'therefore', 'metadata', 'referred', 'mvc',  
          'complete',  
          'examples', 'tokens', 'short', 'mostly', 'jmx', 'JavaConfig', 'classpath', 'coding', 'themselves',  
          'oracle', 'six', 'due', 'editors', 'know', 'due', 'time', 'component-scanning', 'initial', 'succeed',  
          'groovy', 'flag',  
          'important', 'number', 'final', 'synchronously', 'parameters', 'unit', 'configured', 'design', 'so',  
          'value', 'method', 'tostring',  
          'ever', 'loser', '*', 'qa', 'jaxb2', 'valueof', 'supports', 'must', 'plain', 'becomes', 'querying',  
          'difficulty', 'files', 'arises',  
          '=', 'inner', 'bean’s', 'conditions', 'reasons', 'bytes', 'few', 'servers', 'sees', 'proper', 'lt',  
          'tied', 'pattern', 'note', 'get', 'don’t',  
          'autowiring', 'integer', 'date', 'top', 'static', 'parse', 'learn', 'registry', 'learn', 'filter',  
          'issue', 'logging', 'handler', 'simple',  
          'di', 'false', 'url', 'bring', 'welcomes', 'normal', 'check', 'call', 'lead', 'entity', 'keeping',  
          'beanfactory', 'xml', 'spring-web', 'than',  
          'sizes', 'list', 'how', 'rich', 'did', 'micro', 'trouble', 'between', 'everywhere', 'getters', 'looks',  
          'exist', 'little', 'shows', 'auto', 'socket',  
          'common', 'found', 'now', 'nothing', 'copy', 'nonnull', 'respect', 'languages', 'visible', 'failures',  
          'aop', 'lets', 'never', 'once', 'places',  
          'shutdown', 'population', 'helps', 'p', 'args', 'verify', 'otherwise', 'try', 'else', 'expect', 'smart',  
          'performs', 'username', 'singletons',  
          'some', 'process', 'driver', 'spaces', 'destroy', 'an', 'scope', 'safely', 'unless', 'answer', 'makes',  
          'symbol', 'times', 'iterable', 'drop',  
          'first', 'fress', 'powerful', 'size', 'beans', 'valued', 'live', 'hot', 'strings', 'condition', 'data',  
          'birth', 'active', 'ready', 'worry', 'namespace',  
          'reverse', 'buffers', 'longer', 'scanner', 'basic', 'last', 'r', 'offer', 'quickly', 'styles', 'event',  
          'begins', 'startup', 'build', 'add', 'codes', 'error',  
          'fast', 'stopping', 'works', 'vm', 'api', 'left', 'refresh', 'le', 'true', 'root', 'function', 'yes',  
          'null', 'enums', 'topics', 'better', 'libraries', 'handle',  
          'storing', 'self', 'intellij', 'field', 'running', 'distinct', 'ad', 'super', 'ignores', 'yet', 'ejb',  
          'level', 'having', 'rules', 'prefer', 'pre', 'binding',  
          'arg', 'case', 'exports', 'what', 'non', 'listener', 'argue', 'waiting', 'helpful', 'sun', 'option',  
          'initialize', 'random', 'by', 'jdk', 'sa', 'onlyjdk', 'advice',  
          'setting', 'put', 'master', 'cased', 'understands', 'page', 'testbean', 'compete', 'different', 'dowork',  
          'look', 'days', 'mean', 'merge', 'on', 'color', 'happen',  
          'pop', 'red', 'ra', 'sub', 'nice', 'cannot', 'copy', 'returning', 'callback', 'jre', 'fail', 'sleep',  
          'abstract', 'depend', 'compile', 'layout', 'creating', 'repeat',  
          'n', 'regular', 'exits', 'note', 'hair', 'private', 'twice', 'found', 'auto', 'common', 'socket',  
          'functions', 'plays', 'default', 'filters', 'll', 'low', 'implement', 'index',  
          'another', 'gt', 'show', 'import', 'bytepe', 'developer', 'close', 'javax', 'year', 'arrays', 'arrays',  
          'packages', 'passwords', 'proxying', 'go', 'solution',  
          'support', 'double', 'deadlock', 'keys', 'listen', 'detail', 'x', 'serializable', 'enables', 'methods',  
          'classname', 'jcache', 'safe', 'input', 'server', 'updated',  
          'mark', 'maxsize', 'names', 'small', 'interface', 'extends', 'reason', 'mybean', 'useful', 'public',  
          'step', 'ejbs', 'yyyymmdd', 'direct', 'net', 'new', 'response',  
          'extent', 'read', 'exp', 'foo', 'smell', 'parent', 'tie', 'project', 'json', 'directory', 'websocket',  
          'iso', 'linkedlist', 'inject', 'no', 'string', 'load', 'maven', 'void',  
          'file', 'proxy', 'people', 'changes', 'apps', 'away', 'computed', 'sweet', 'boot', 'tool', 'able',  
          'further', 'after', 'insert', 'update', 'ee', 'minxin', 'access', 'concat',  
          'customer', 'util', 'you', 'key', 'status', 'enough', 'do', 'adding', 'logic', 'said', 'impossible',  
          'doing', 'target', 'dao', 'begin', 'byte', 'based', 'role', 'test', 'view', 'port',  
          'tx', 'have', 'think', 'files', 'search', 'controller', 'problems', 'someone', 'core', 'github', 'letter',  
          'join', 'adds', 'see', 'tools', 'changed', 'kind', 'nullable', 'contexts',  
          'second', 'for', 'task', 'fully', 'lazy', 'change', 'main', 'write', 'template', 'really', 'counts',  
          'human', 'span', 'b', 'messages', 'waits', 'substring', 'code', 'differences',  
          'fairly', 'library', 'course', 'beanname', 'windows', 'single', 'our', 'just', 'spi', 'security', 'local',  
          'netty', 'builder', 'application', 'kotlin', 'id', 'allows', 'here',  
          'replace', 'o', 'offline', 'act', 'line', 'constants', 'systems', 'telling', 'good', 'thing', 'up',  
          'doesn', 'parser', 'sort', 'match', 'eq', 'resource', 'bootstrap', 'save',  
          'etc', 'zero', 'skip', 'zip', 'good', 'style', 'prints', 'package', 'database', 'org', 'app', 'env',  
          'hold', 'zip', 'creator', 'happened', 'abean', 'three', 'require', 'lib', 'hello',  
          'gb', 'sql', 'div', 'correct', 'issues', 'care', 'trying', 'pool', 's', 'retry', 'label', 'less', 'been',  
          'orm', 'basics', 'en', 'jndi', 'cache', 'aspectj', 'disable', 'end', 'enable',  
          'reflect', 'choose', 'who', 'us', 'need', 'xyz', 'every', 'at', 'their', 'we', 'g', 'make', 'out', 'dsl',  
          'web', 'sign', 'via', 'city', 'log', 'include', 'unlock', 'unknown', 'mix',  
          'books', 'country', 'box', 'then', 'choose', 'wrong', 'correct', 'published', 'take', 'needs', 'return',  
          'example', 'configuration', 'be', 'e', 'type', 'throw', 'jpa', 'while',  
          'includes', 'et', 'min', 'testing', 'let', 'turn', 'u', 'like', 'world', 'ioc', 'too', 'console', 'two',  
          'okay', 'nio', 'early', 'future', 'environment', 'whys', 'both', 'later',  
          'white', 'very', 'may', 'term', 'readers', 'class', 'boolean', 'values', 'move', 'carry', 'none', 'old',  
          'scan', 'http', 'idea', 'both', 'paths', 'stop', 'jars', 'ret', 'pojo',  
          'init', 'decided', 'ge', 'negative', 'absolute', 'best', 'lower', 'info', 'buffer', 'popular', 'intend',  
          'version', 'solve', 'dd', 'output', 'object', 'i', 'options', 'keeps',  
          'myapp', 'your', 'way', 'exists', 'config', 'mm', 're', 'wiki', 'com', 'asking', 'knows', 'all', 'route',  
          'where', 'op', 'map', 'special', 'benefit', 'five', 'should',  
          'anywhere', 'large', 'txt', 'bad', 'possible', 'cglib', 'don', 'lookup', 'door', 'starts', 'effective',  
          'avoid', 'mock', 'z', 'enum', 'resolve', 'without', 'lifecycle',  
          'age', 'wish', 'find', 'asked', 'leaving', 'override', 'variable', 'took', 'one', 'ways', 'finally',  
          'allowed', 'were', 'context', 'maximum', 'likely', 'html', 'imported',  
          'takes', 'path', 'place', 'find', 'each', 'accept', 'run', 'war', 'apache', 'display', 'can', 'primary',  
          'produce', 'denote', 'ease', 'source', 'meta', 'saw', 'content',  
          'resources', 'manages', 'filesystem', 'extend', 'sent', 'os', 'alone', 'element', 'mock', 'tolowercase',  
          'problem', 'keep', 'ignored', 'extended', 'allow', 'responses',  
          'any', 'datasource', 'full', 'thymeleaf', 'whether', 'avoid', 'often', 'focus', 'soft', 'them', 'declare',  
          'tip', 'environments', 'webflux', 'ignore', 'cookie', 'logs', 'down',  
          'accepts', 'help', 'commit', 'src', 'compute', 'connection', 'means', 'fork', 'least', 'follow',  
          'uniquely', 'lifecycle', 'jsr', 'jsps', 'concurrency', 'components', 'next',  
          'debug', 'crud', 'generally', 'post', 'committed', 'use', 'stream', 'lang', 'request', 'four', 'browser',  
          'switch', 'chain', 'exclude', 'flags', 'rebuild', 'readonly',  
          'lambdas', 'simplify', 'ability', 'become', 'err', 'serves', 'does', 'parameter', 'completely',  
          'importance', 'large', 'listing', 'items', 'focusing', 'upload', 'raw',  
          'share', 'everything', 'interfaces', 'assert', 'analyze', 'why', 'difficult', 'png', 'clean', 'catch',  
          'force', 'logo', 'my',  
          'sometime', 'extract', 'tom', 'oop', 'xxx', 'together', 'jsf', 'title', 'ws', 'header', 'memory',  
          'document', 'remote', 'login', 'except', 'rows',  
          'table', 'lru', 'there', 'listeners', 'job', 'al', 'team', 'uri', 'el', 'mac', 'slf', 'finish',  
          'yourself', 'until', 'person', 'user', 'length', 'jnditemplate',  
          'already', 'async', 'd', 'anyway', 'balance', 'nature', 'plugin', 'io', 'egg', 'program', 'improve',  
          'initializingbean', 'clear', 'advanced', 'email', 'included', 'within', 'ids',  
          'host', 'redis', 'mixin', 'thread', 'se', 'finishes', 'reader', 'blog', 'j', 'over', 'always', 'difference',  
          'errors',  
          'select', 'roots', 'channel', 'comp', 'managedbean', 'docker', 'classpathresource', 'scanning', 'jmock',  
          'goes', 'nonnullapi',  
          'threadlocal', 'print', 'mycommand', 'beantype', 'javaconfig', 'lts', 'jsonpath', 'fire', 'spis',  
          'dateformatter', 'questions',  
          'respects', 'lose', 'looking', 'stops', ''  
          ]  
  
  
def get_word_list_from_doc(urlList):
    """Collect the less-common English words that appear on a set of web pages.

    Each URL in ``urlList`` is downloaded and the text of every ``<p>`` tag is
    scanned for runs of ASCII letters. Runs whose length is strictly between
    3 and 20 are lower-cased and gathered into a set, so duplicates collapse.
    Finally every word listed in the module-level ``xxs_list`` is removed.

    :param urlList: iterable of page URLs to crawl.
    :return: list of the remaining words, sorted alphabetically so that
        repeated runs over the same pages give the same order.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}
    # Compiled once here instead of being re-looked-up for every paragraph.
    word_pattern = re.compile('[a-zA-Z]+')
    words = set()

    # Crawl every link in urlList.
    for url in urlList:
        # A timeout keeps one unresponsive server from hanging the whole run.
        html_text = requests.get(url, headers=headers, timeout=30).text
        soup = BeautifulSoup(html_text, features='html.parser')
        # Only the text inside <p> tags is considered.
        for paragraph in soup.find_all("p"):
            for word in word_pattern.findall(paragraph.get_text()):
                # Very short matches are mostly meaningless fragments, and very
                # long ones tend to be variable names or technical terms, so
                # both are discarded at this level.
                if 3 < len(word) < 20:
                    words.add(word.lower())

    # Drop the simple words via a set difference.
    words = words.difference(xxs_list)

    # The original assigned the result list inside a comment by accident,
    # which left the returned name undefined.
    return sorted(words)
  
  
def split_list(list, batch_size):
    """Cut ``list`` into consecutive slices of at most ``batch_size`` items.

    Example: ``split_list([1, 2, 3, 4], 2)`` gives ``[[1, 2], [3, 4]]``.
    The last slice is shorter when the length is not a multiple of
    ``batch_size``.

    The first parameter keeps its original name even though it shadows the
    builtin, so existing keyword callers keep working.
    """
    chunks = []
    for start in range(0, len(list), batch_size):
        stop = start + batch_size
        chunks.append(list[start:stop])
    return chunks
  
  
# Script entry point: crawl the listed documentation pages, then translate the
# collected words in batches and print the combined result.
if __name__ == '__main__':
    urlList = ['https://docs.spring.io/spring-framework/docs/current/reference/html/core.html#spring-core',
               'https://docs.spring.io/spring-framework/docs/current/reference/html/testing.html#testing',
               'https://docs.spring.io/spring-framework/docs/current/reference/html/overview.html#overview',
               'https://kafka.apache.org/documentation/#gettingStarted']

    l1 = get_word_list_from_doc(urlList)

    print(l1)
    # len() returns an int, which cannot be concatenated to a str directly.
    print("单词数量为：" + str(len(l1)))
    # Per the author's note, Baidu Translate limits how much one query may
    # contain, so the words are sent in batches of 300.
    sl = split_list(l1, 300)

    for batch in sl:
        resultList.extend(baidu.queryList(batch))
    # Prints the words together with their translations; pretty-print the
    # output as JSON yourself if you want it easier to read.
    print(resultList)

文件名：百度翻译api.py（需要与上面代码中的 import 百度翻译api 保持一致）

注意：下列代码使用的是百度翻译开放平台的接口，请把 appid 和 appkey 修改成自己的

不懂的可以自己搜索一下教程，关键字：百度翻译开放平台注册教程

# 说明:demo都是百度翻译开放平台下载的代码稍微修改的  
  
import requests  
import random  
from hashlib import md5  
  
# Replace these with your own appid and appkey. If you do not have them yet,
# search for the Baidu Translate open platform and register; according to the
# original note the interface is free of charge.
# http://api.fanyi.baidu.com/api/trans/product/desktop
appid = ''  # put your own id here
appkey = ''  # put your own key here

# The request URL is the host followed by the translate path.
endpoint = 'http://api.fanyi.baidu.com'
path = '/api/trans/vip/translate'
url = ''.join((endpoint, path))
  
# Generate salt and sign  
def make_md5(s, encoding='utf-8'):
    """Return the hexadecimal MD5 digest of ``s`` after encoding it.

    :param s: text to hash.
    :param encoding: codec used to turn ``s`` into bytes (UTF-8 by default).
    :return: the digest as a hexadecimal string.
    """
    raw_bytes = s.encode(encoding)
    digest = md5(raw_bytes)
    return digest.hexdigest()
  
  
def query(q, from_lang, to_lang):
    """Send ``q`` to the translate endpoint and return its ``trans_result``.

    The request is signed with the MD5 of ``appid + q + salt + appkey``, where
    ``salt`` is a random integer generated for each call.

    :param q: text to translate.
    :param from_lang: source language code, sent as the ``from`` field.
    :param to_lang: target language code, sent as the ``to`` field.
    :return: the ``trans_result`` entry of the JSON response. The sample kept
        from the original source shows a list of ``src``/``dst`` pairs::

            [
                {"src": "Hello World! This is 1st paragraph.", "dst": "你好，世界！这是第一段。"},
                {"src": "This is 2nd paragraph.", "dst": "这是第二段。"}
            ]
    :raises KeyError: if the response contains no ``trans_result`` key; the
        whole response is included in the message to show what came back.
    """
    # Generate salt and sign
    salt = random.randint(32768, 65536)
    sign = make_md5(appid + q + str(salt) + appkey)

    # Build request
    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
    payload = {'appid': appid, 'q': q, 'from': from_lang, 'to': to_lang, 'salt': salt, 'sign': sign}

    # Send request; the timeout keeps an unresponsive server from blocking forever.
    r = requests.post(url, params=payload, headers=headers, timeout=30)
    result = r.json()

    if 'trans_result' not in result:
        # Same exception type as the plain lookup would raise, but with the
        # response attached so the cause is visible.
        raise KeyError("'trans_result' missing from response: %r" % (result,))
    return result['trans_result']
  
def queryList(ql, from_lang='en', to_lang='zh'):
    """Translate a list of words with a single call to ``query``.

    Every word is followed by a newline and the pieces are concatenated into
    one query string.

    :param ql: list of words to look up.
    :param from_lang: source language code; English by default.
    :param to_lang: target language code; Chinese by default. Pass other
        codes if a different language pair is needed.
    :return: whatever ``query`` returns for the combined text.
    """
    joined = ''.join(word + '\n' for word in ql)
    return query(joined, from_lang, to_lang)
  
  
# Manual smoke test: translate two sample sentences from English to Chinese.
if __name__ == '__main__':
    # The list assignment had been swallowed by a comment, leaving list1
    # undefined, and the language codes were passed to print() instead of
    # queryList().
    list1 = ['Hello World! This is 1st paragraph.', 'This is 2nd paragraph.']
    print(queryList(list1, 'en', 'zh'))