XPath crawling projects

Shanbay crawler

Scrape the word/definition table from a Shanbay word list (pages 1-3) and save the results to shanbei.json.

import requests
from lxml import etree
import json

class ShanBei(object):
    def __init__(self,url,result):
        self.url = url
        self.result = result
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
        }

    def get_html(self):
        try:
            response = requests.get(url=self.url, headers=self.headers)
            response.raise_for_status()
            html = response.content.decode('utf-8')
            self.parse_html(html)
        except requests.RequestException:
            # Network or HTTP error: skip this page instead of crashing
            return False

    def parse_html(self, html):
        xpath_html = etree.HTML(html)
        # Each table row holds one word and its definition
        trs = xpath_html.xpath('//table[contains(@class,"table-bordered")]/tbody/tr')
        for tr in trs:
            item = {}
            word = tr.xpath('./td[@class="span2"]/strong/text()')[0]
            means = tr.xpath('./td[@class="span10"]/text()')[0]
            item[word] = means
            self.result.append(item)
        self.save_to_json(self.result)

    def save_to_json(self, result):
        # Rewrite the file with everything collected so far; ensure_ascii=False keeps the Chinese definitions readable
        with open('shanbei.json', 'w', encoding='utf-8') as fp:
            json.dump(result, fp, ensure_ascii=False)

if __name__ == '__main__':
    result = []
    base_url = 'https://www.shanbay.com/wordlist/110521/232414/?page={}'
    for i in range(1,4):
        url = base_url.format(i)
        shanbei = ShanBei(url,result)
        shanbei.get_html()
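
As a quick check, the saved file can be loaded back and inspected. A minimal sketch, assuming shanbei.json was written by the save_to_json method above (a JSON array of one-entry word-to-definition dicts):

import json

# Load the word list written by ShanBei.save_to_json and print a few entries
with open('shanbei.json', encoding='utf-8') as fp:
    words = json.load(fp)

print(len(words), 'words scraped')
for entry in words[:5]:
    for word, meaning in entry.items():
        print(word, '->', meaning)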

NetEase Cloud Music artist crawler

Starting from the artist discovery page, follow each region category and each initial-letter category, visit every artist's page, and save the artist's name, page url, and description to singer.json.

import requests
from lxml import etree
import json

class WangYiYun(object):
    def __init__(self):
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
            'cookie': '_ntes_nnid=bde7c65abd5e06bbf64efe60691ef247,1566111466592; _ntes_nuid=bde7c65abd5e06bbf64efe60691ef247; __oc_uuid=7c038f50-c185-11e9-86f9-4bd3dd23d629; _iuqxldmzr_=32; WM_TID=cN5fXBJlrPpFQBFRFUM5pc68R2vwXqaF; UM_distinctid=16d9b2096ba1f6-0c3fa044fca013-3f385804-15f900-16d9b2096bb321; P_INFO=lovekk0502@163.com|1570539822|0|163|00&99|bej&1570265765&unireg#bej&null#10#0#0|&0|unireg|lovekk0502@163.com; mail_psc_fingerprint=db28de61b0efc44e30fad7491c2519e4; __root_domain_v=.163.com; _qddaz=QD.887vbv.pz0uqc.k1ut3v9f; hb_MA-BFF5-63705950A31C_source=www.baidu.com; JSESSIONID-WYYY=nCM3UFRCHU%2BN29I40JCdWqeUoK9ODAMvhatJvu55zI3N5TkvtqadSbtPdyhfp9%5C3%2BGu3evqCf6R%5ClFUng2S3yPCh92RD6%2FWvhk2AE6GsIoF%2By8m%5CV%5C%5CPIzx31iRMqK9pp%2B2dvcaNfTK5PmfV9kxB94U%5Cmt6k7MR%5C8bc%2BAS50YMZ6ijBN%3A1572501969558; WM_NI=t0uRHExyLcdgXOVHlLxRzq%2F3shycmWJQ15HvrwOf%2FvY%2F78m60wcmaPJFIVETBrWPHkprlp0M%2B53OC9cPIhhiyAFqmCWHOwIxdr9v3%2FltLFblCA1sqZCopftQL4zBBC3TUWw%3D; WM_NIKE=9ca17ae2e6ffcda170e2e6eeaddc67f79e8a97e55daab08ea6c54e969f9baeb87f86b88fbbce7cad9f81a3d22af0fea7c3b92ab8f0bbb3cb4fbca78484e76d9c919aa9ef48bc8eb692ae39b1b8b784f05d8a8fffd6cb7eb2a98187ae7e9bf1f9d9f03ef4a9ad8ee45c8a9a87b7c86082ae99b1cb61b4aea491b347e9ebfdb0c233a1908789eb63bb9f8a92f04995988198cc74a8aef797b225b0ac84a7e168819fff94f45281b1c087f75b90ecb6a3c149f18c9bb8f637e2a3',
            'upgrade-insecure-requests': '1'
        }

    def get_html(self,url):
        response = requests.get(url=url,headers=self.headers)
        html = response.content.decode('utf-8')
        xpath_html = etree.HTML(html)
        return xpath_html

    ## Visit the artist's description page and save the full record
    def parse_detail(self, url, item):
        html = self.get_html(url)
        description = html.xpath('//div[@class="n-artdesc"]/p/text()')
        item['description'] = ''.join(description).replace('\n', '')
        self.save_to_json(item)


    ## Parse each artist's name and url on a letter page
    def parse_singer(self, url):
        html = self.get_html(url)
        singer_name = html.xpath('//ul[@id="m-artist-box"]/li/p/a[1]/text()')
        singer_url = html.xpath('//ul[@id="m-artist-box"]/li/p/a[1]/@href|//ul[@id="m-artist-box"]/li/a[1]/@href')
        singer_url = [x.strip() for x in singer_url]
        for index, name in enumerate(singer_name):
            ## A fresh dict per artist so earlier records are not overwritten
            item = {}
            item['singer_name'] = name
            new_url = 'https://music.163.com' + singer_url[index]
            item['singer_url'] = new_url
            ## url of the artist description page, e.g. /artist?id=123 -> /artist/desc?id=123
            path, query = singer_url[index].split('?')
            url = 'https://music.163.com' + path + '/desc?' + query
            self.parse_detail(url, item)



    ## Grab the url of each initial-letter category
    def parse_zimu(self, url):
        html = self.get_html(url)
        zimu_url = html.xpath('//ul[@id="initial-selector"]/li/a/@href')
        for one_url in zimu_url:
            new_url = 'https://music.163.com' + one_url
            self.parse_singer(new_url)


    ### Get the index page and grab the url of each region category
    def parse_index(self, url):
        html = self.get_html(url)
        area_url = html.xpath('//div[@class="blk"]/ul/li/a/@href')
        for one_url in area_url:
            new_url = 'https://music.163.com' + one_url
            self.parse_zimu(new_url)


    def save_to_json(self, result):
        # Append one JSON object per line (JSON Lines) so each artist is kept instead of overwriting the file
        with open('singer.json', 'a', encoding='utf-8') as fp:
            fp.write(json.dumps(result, ensure_ascii=False) + '\n')

if __name__ == '__main__':
    base_url = 'https://music.163.com/discover/artist'
    wangyiyun = WangYiYun()
    wangyiyun.parse_index(base_url)
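
As with the Shanbay script, the output can be checked after a run. A minimal sketch, assuming singer.json was written by the save_to_json method above as one JSON object per line:

import json

# Read back the JSON Lines file written by WangYiYun.save_to_json
with open('singer.json', encoding='utf-8') as fp:
    artists = [json.loads(line) for line in fp if line.strip()]

print(len(artists), 'artists scraped')
for artist in artists[:3]:
    print(artist['singer_name'], artist['singer_url'])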