Scraping Maoyan Movies, Fangwang, the Guba stock forum, Baidu Translate, Youdao Translate, Amap Weather, ChinaAMC funds, Shanbay vocabulary, and Qiushibaike with Python (this post: the Shanbay vocabulary list)

'''
Scrape the "Python must-know vocabulary" list from Shanbay
Entry URL: https://www.shanbay.com/wordlist/110521/232414/
Goal: collect every Python vocabulary entry into a dict, then save the data

    Approach (the list is paginated with a ?page= query parameter; see the small sketch below):
        Page 1: https://www.shanbay.com/wordlist/110521/232414/
        Page 2: https://www.shanbay.com/wordlist/110521/232414/?page=2

'''
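# A minimal sketch of the pagination pattern described above (an illustration only;
# it assumes the list keeps accepting the ?page= query parameter, and that page 1
# also loads without it). demo_page_urls is a hypothetical helper, not used below.
def demo_page_urls(pages):
    # Build one URL per page: .../?page=1, .../?page=2, ...
    base = 'https://www.shanbay.com/wordlist/110521/232414/?page='
    return [base + str(p) for p in range(1, pages + 1)]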

import json
import os
import re

import requests
from lxml import etree

class Shanbei:
    def __call__(self, *args, **kwargs):
        self.get_xml(*args)

    def get_xml(self,pages):
        '''
        Fetch each page of the Shanbay Python vocabulary list and parse it with lxml
        :param pages: number of pages to scrape
        :return: None (each parsed page is handed off to get_python)
        '''
        base_url = 'https://www.shanbay.com/wordlist/110521/232414/?page='

        for page in range(1,pages+1):
            # Fetch the current page once and reuse the HTML for both the regex and the xpath parsing
            base_html = requests.get(base_url + str(page)).text
            base_xml = etree.HTML(base_html)

            # print(base_xml)

            # Keep a local copy of the raw HTML for debugging, one file per page
            with open('shanbay_page_{}.html'.format(page), 'w', encoding='utf-8') as f:
                f.write(base_html)
            self.get_python(base_html, base_xml, page)


    def get_python(self, base_html, base_xml, page):
        '''
        Parse the vocabulary data on one page
        :param base_html: raw HTML of the page
        :param base_xml: lxml tree built from the same HTML
        :param page: page number, used for the output path
        :return: None (the page's words are written to disk)
        '''
        # Grab the list title from the <h4> heading with a regex
        # (an xpath against base_xml would work just as well)
        title_rule = re.compile(r'<h4>(.*?)</h4>', re.S)
        main_title = title_rule.findall(base_html)

        # Pull the English word ("Python") out of the heading text
        get_words = re.compile(r'[A-Za-z]+')
        word_title = get_words.findall(main_title[0])[0]

        print(word_title)

        # Count the vocabulary rows on this page
        trs_length = len(base_xml.xpath("/html/body/div[@class='container main-body ']/div[@class='row']/div[@class='span8']/div[@class='row'][2]/div[@class='span8']/table[@class='table table-bordered table-striped']/tbody/tr[@class='row']"))
        print(trs_length)

        # Top-level dict that will hold every word on this page
        shell_word_dict = {
            f'Essential {word_title} words': {},
        }

        # Walk the rows one by one (xpath indices are 1-based)
        for line in range(trs_length):
            word = base_xml.xpath("/html/body/div[@class='container main-body ']/div[@class='row']/div[@class='span8']/div[@class='row'][2]/div[@class='span8']/table[@class='table table-bordered table-striped']/tbody/tr[@class='row'][{}]/td[@class='span2']/strong/text()".format(line+1))
            word_description = base_xml.xpath("/html/body/div[@class='container main-body ']/div[@class='row']/div[@class='span8']/div[@class='row'][2]/div[@class='span8']/table[@class='table table-bordered table-striped']/tbody/tr[@class='row'][{}]/td[@class='span10']/text()".format(line+1))

            shell_word_dict[f'Essential {word_title} words'][word[0]] = word_description[0].strip()
        print(shell_word_dict)

        # Save the page to disk; note that this writes the dict built outside the loop above
        path = './shanbay_words/page_{}/'.format(page)
        print(path)
        if not os.path.exists(path):
            os.makedirs(path)
        file_path = path + 'page_{}.json'.format(page)

        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(shell_word_dict, f, ensure_ascii=False, indent=2)


if __name__ == '__main__':
    # pages = int(input('How many pages would you like to scrape? '))
    pages = 3
    shanbei = Shanbei()
    shanbei(pages)
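
For reference, each saved file can be read back with json.load. A minimal sketch, assuming page 1 was scraped and the output layout above ('./shanbay_words/page_1/page_1.json') is unchanged:

import json

with open('./shanbay_words/page_1/page_1.json', encoding='utf-8') as f:
    saved = json.load(f)
# Keys are the list titles (e.g. 'Essential Python words'); values map word -> definition
for title, words in saved.items():
    print(title, len(words))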