Project Training: Crawling Word Data to Build a Vocabulary Database

The project's AI speaking assistant needs its own vocabulary database, so this week the word crawler was completed: over 50,000 words in total, each with its spelling, translations, pronunciation, example sentences, and sentence translations. To improve throughput, the crawler runs 11 worker processes in parallel (via multiprocessing.Process), which fetched the full dataset in about two hours. To get past the target site's anti-crawling measures, each request picks a random User-Agent string from a pool. The site is fetched and parsed with requests and lxml (despite the ScrapyProcess class name, the Scrapy framework itself is not used); scraped word entries are written to JSON files, and pronunciation audio is saved as mp3.
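For reference, each scraped entry is stored as a JSON object shaped roughly like this (a sketch inferred from the code below; the values are illustrative, not real scraped data):

{
    "word": "apple",
    "tran": [
        {"n": "苹果"}
    ],
    "example": [
        {
            "origin": "source film or show title",
            "sen": "an example sentence containing the word",
            "sen_tran": "Chinese translation of the sentence"
        }
    ]
}

The complete code: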

#!/usr/bin/env python3
import json
import os
import random

import requests
from lxml import etree
from multiprocessing import Process


user_agent = [
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
    "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
    "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
    "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
    "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
    "UCWEB7.0.2.37/28/999",
    "NOKIA5700/ UCWEB7.0.2.37/28/999",
    "Openwave/ UCWEB7.0.2.37/28/999",
    "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
    # iPhone 6:
    "Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25",
]


def get_user_agent():
    """Return a random User-Agent string to disguise each request."""
    return random.choice(user_agent)

class ScrapyProcess(Process):
    """Worker process: reads one word-list file and scrapes every word in it."""

    def __init__(self, file_name):
        super(ScrapyProcess, self).__init__()
        self.file_name = file_name  # path prefix, e.g. './words/1'

    def read_file(self):
        """Yield one word per line from <file_name>.txt."""
        with open(self.file_name + '.txt', 'r', encoding='utf-8') as f:
            for line in f:
                yield line.rstrip('\n')

    def download_file(self, url, path):
        """Download a binary file (the pronunciation mp3) to `path`."""
        # Reuse the rotating User-Agent here too, in case audio requests are filtered
        res = requests.get(url, headers={"User-Agent": get_user_agent()})
        with open(path, 'wb') as f:
            f.write(res.content)

    def connect_file(self, file_name1, file_name2, file_name3):
        """Concatenate two files into a third, then remove the originals."""
        with open(file_name1, 'rb') as file1, open(file_name2, 'rb') as file2, \
                open(file_name3, 'wb') as file3:
            file3.write(file1.read())
            file3.write(file2.read())
        os.remove(file_name1)
        os.remove(file_name2)

    def is_in(self, key, dict_list):
        """Return True if `key` appears as a key in any dict of `dict_list`."""
        for item in dict_list:
            if key in item:
                return True
        return False

    def scrapy(self, word):
        """Scrape one word's page; return its info dict, or None if not found."""
        headers = {
            "User-Agent": get_user_agent(),
        }

        word_info = {}
        url = 'http://www.91dict.com/words?w=' + word
        res = requests.get(url, headers=headers)
        res.encoding = 'utf-8'
        data = etree.HTML(res.text)
        if data.xpath('/html/body/div[2]/section[2]/div/div/div/div[1]/div[1]/p/text()'):

            # Word spelling
            word_info['word'] = data.xpath('/html/body/div[2]/section[2]/div/div/div/div[1]/div[1]/p/text()')[0]

            # Pronunciation: locate the <audio> element and save its mp3
            try:
                pronounce_element = data.xpath('/html/body/div[2]/section[2]/div/div/div/div[1]/div[1]/div[1]/span/audio')[0]
                pron_url = pronounce_element.get('src')  # URL of the audio file
                if pron_url:
                    os.makedirs('./audio', exist_ok=True)  # ensure the output dir exists
                    file_path = './audio/%s.mp3' % (word_info['word'])
                    self.download_file(pron_url, file_path)
                else:
                    print("no audio URL for", word_info['word'])
            except IndexError:
                print("no audio element for", word_info['word'])

            # Translations: each entry maps a part of speech to a meaning
            tran = []
            for item in filter(lambda x: x != '', map(lambda x: x.replace('\n', ''),
                                                      data.xpath("//*[@class='listBox']/text()"))):
                parts = item.split('. ')
                if len(parts) == 1:
                    tran.append({'': parts[0]})
                elif len(parts) == 2 and not item.startswith('=') and not self.is_in(parts[0], tran):
                    tran.append({parts[0]: parts[1]})
            word_info['tran'] = tran

            # Example sentences
            example = []
            example_len = len(data.xpath(
                "//*[@class='flexslider flexslider_2']/ul/li/div[@class='imgMainbox']"))
            # Sentence bodies
            sens = data.xpath("//*[@class='mBottom']")
            # Sentence translations
            sen_trans = data.xpath("//*[@class='mFoot']/text()")
            # Source (film or show) each sentence comes from
            origins = list(filter(lambda x: x != '\n', data.xpath(
                "//*[@class='mTop']/text()")))
            # Follow-up context and its translation (collected but unused below)
            next_sens = data.xpath(
                "//*[@class='mTextend']/div[2]/div[2]/p[1]/text()")
            next_sen_trans = data.xpath(
                "//*[@class='mTextend']/div[2]/div[2]/p[2]/text()")
            next_pron_urls = data.xpath("//*[@class='viewdetail']/@href")
            for i in range(example_len):
                # Serialize the sentence node, stripping the wrapping tag bytes
                sen = etree.tostring(
                    sens[i], encoding='utf-8')[22:-7].decode('utf-8')
                sen_tran = sen_trans[i][1:]  # drop the leading newline
                example.append({
                    'origin': origins[i][1:-1],  # trim the surrounding newline chars
                    'sen': sen,
                    'sen_tran': sen_tran,
                })
            word_info['example'] = example
            return word_info

    def main(self):
        """Scrape every word in this worker's list and persist each result."""
        for word in self.read_file():
            print(word)
            self.save(self.scrapy(word))

    def save(self, word_info):
        """Append one word entry to <file_name>.json."""
        with open(self.file_name + '.json', 'a', encoding='utf-8') as f:
            if word_info:
                json.dump(word_info, fp=f, indent=4, ensure_ascii=False)
                f.write(',\n')

    def run(self):
        self.main()


if __name__ == "__main__":
    # One worker per word-list file: ./words/1.txt through ./words/11.txt
    for i in range(1, 12):
        p = ScrapyProcess('./words/' + str(i))
        # Start the child process
        p.start()
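
One caveat on the output format: save() writes each entry followed by ',\n', so the resulting .json files are a comma-separated stream of objects rather than a valid JSON array. A small post-processing step can wrap them into one; below is a minimal sketch (merge_words and the paths are illustrative helpers, not part of the crawler above):

import json

def merge_words(json_path, out_path):
    # Read the comma-separated objects, drop the trailing ',\n' left by
    # save(), and wrap the whole thing in brackets to form a valid array.
    with open(json_path, 'r', encoding='utf-8') as f:
        raw = f.read().rstrip().rstrip(',')
    words = json.loads('[' + raw + ']')
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(words, f, indent=4, ensure_ascii=False)
    return words

# Example: merge_words('./words/1.json', './words/1_full.json')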
