Crawler data parsing (3 spider examples)

(1) Locate the target tags

(2) Extract the data from them (a minimal sketch of both steps follows)
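Both steps boil down to two lxml calls: etree.HTML() parses the page source into an element tree, and .xpath() is used first to locate element nodes and then to pull out attributes or text. A minimal standalone sketch (the HTML snippet here is made up purely for illustration):

from lxml import etree

html = '<div id="content"><a href="/p/1">first</a><a href="/p/2">second</a></div>'
tree = etree.HTML(html)

# (1) locate tags: an XPath that returns element nodes
links = tree.xpath('//div[@id="content"]/a')

# (2) extract data: @attr pulls an attribute, text() pulls the text content
for a in links:
    print(a.xpath('./@href')[0], a.xpath('./text()')[0])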

 

1. Download the images from the first 5 pages of Qiushibaike's picture channel

import requests
from urllib import request
from lxml import etree


headers={
    "User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3521.2 Safari/537.36"
}

url = 'https://www.qiushibaike.com/pic/page/{}/?s=5196770'
for page in range(1, 6):
    if page == 1:
        new_url = 'https://www.qiushibaike.com/pic'
    else:
        new_url = url.format(page)

    page_text = requests.get(url=new_url, headers=headers).text

    tree = etree.HTML(page_text)


    div_list = tree.xpath('//div[@id="content-left"]/div')

    for div in div_list:
        img_name = div.xpath('./div[2]/a/img/@alt')[0]
        img_url = 'https:' + div.xpath('./div[2]/a/img/@src')[0]
        img_path = './day118/01/' + img_name + '.jpg'
        # urlretrieve from urllib's request module (not requests) downloads and saves a file
        # directly: the first argument is the download URL, the second is the local path + file name
        request.urlretrieve(img_url, img_path)
        print(img_name, 'downloaded successfully')
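Note that urlretrieve sends no custom headers, so a site that checks the User-Agent may reject the request. One alternative sketch (save_image is a hypothetical helper, not part of the original code): fetch the raw bytes with requests using the same headers, then write them out in binary mode, the same pattern example 3 below uses.

import requests

def save_image(img_url, img_path, headers):
    # download the raw bytes with the page headers, then save them in 'wb' mode
    img_data = requests.get(url=img_url, headers=headers).content
    with open(img_path, 'wb') as f:
        f.write(img_data)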

  

2. Scrape crawler-related job postings from BOSS Zhipin (detail pages)

import requests
from lxml import etree

headers={
    "User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3521.2 Safari/537.36"
}

url = 'https://www.zhipin.com/c101010100/?query=%E7%88%AC%E8%99%AB&page=1&ka=page-1'

page_text = requests.get(url=url, headers=headers).text

tree = etree.HTML(page_text)

li_list = tree.xpath('//div[@class="job-list"]/ul/li')

for li in li_list:
    job_url = 'https://www.zhipin.com' + li.xpath('.//h3/a/@href')[0]
    print(job_url)

    job_text = requests.get(url=job_url, headers=headers).text
    job_tree = etree.HTML(job_text)

    job_name = job_tree.xpath('//div[@class="smallbanner"]/div[1]/div[2]/div[1]/h1/text()')[0]
    job_salary = job_tree.xpath('//div[@class="smallbanner"]/div[1]/div[2]/div[1]/span/text()')[0]
    # The 3rd through 7th jobs use a different HTML layout from the earlier ones, so both XPaths are
    # joined with the pipe (union) operator; the last item of the result list is the company name
    job_addr = job_tree.xpath(
        '//div[@class="detail-content"]/div[5]/div[@class="name"]/text() | //*[@id="main"]/div[3]/div/div[2]/div[2]/div[4]/div[1]/text()'
    )[-1]
    # Print each job's title, salary and company name
    print(job_name, job_salary, job_addr)
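The pipe in the XPath above is the union operator: both expressions are evaluated and their matches merged, so whichever layout the detail page actually uses contributes the result, and [-1] keeps the last match. A minimal standalone sketch (the two HTML snippets are invented for illustration):

from lxml import etree

# only one of the two layouts exists on any given detail page
page_a = '<div class="name">Company A</div>'
page_b = '<div class="detail-content"><div class="company">Company B</div></div>'

for html in (page_a, page_b):
    tree = etree.HTML(html)
    # the '|' union returns matches from whichever branch is present
    name = tree.xpath('//div[@class="name"]/text() | //div[@class="company"]/text()')[-1]
    print(name)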

  

3. Download the free resume templates from Chinaz (站长素材), first 10 pages: http://sc.chinaz.com/jianli/free.html

import requests
import random
from lxml import etree


headers={
    "User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3521.2 Safari/537.36"
}

url = 'http://sc.chinaz.com/jianli/free_{}.html'

for page in range(1, 11):
    if page == 1:
        new_url = 'http://sc.chinaz.com/jianli/free.html'
    else:
        new_url = url.format(page)

    response = requests.get(url=new_url, headers=headers)
    # Fix the encoding (mojibake) problem
    response.encoding = 'utf-8'
    page_text = response.text

    tree = etree.HTML(page_text)

    div_list = tree.xpath('//div[@id="container"]/div')

    for div in div_list:
        # Name of the resume template
        res_name = div.xpath('./a/img/@alt')[0]
        # Second way to fix the mojibake, only needed if response.encoding is not set above:
        # res_name = res_name.encode('iso-8859-1').decode('gbk')
        # Local path to save the file to
        res_path = './day118/03/' + res_name + '.rar'
        # URL of the template's detail page
        res_url = div.xpath('./a/@href')[0]
        # Request the detail page
        res_text = requests.get(url=res_url, headers=headers).text
        # Parse it with etree
        res_tree = etree.HTML(res_text)
        # List of download mirror URLs (Fujian Telecom, Xiamen Telecom, ...)
        down_list = res_tree.xpath('//div[@id="down"]/div[2]//a/@href')
        # Pick one download URL at random
        down_url = random.choice(down_list)
        # The template is a binary archive, so use .content, not .text!
        down_res = requests.get(url=down_url, headers=headers).content
        print(res_path)
        # Binary data is being written, so open in 'wb' mode (no encoding='utf-8')
        with open(res_path, 'wb') as f:
            f.write(down_res)
        print(res_name, 'download complete')
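Why the commented-out second method works: when a page does not declare its charset, requests typically falls back to ISO-8859-1, so the text it returns is the right bytes decoded with the wrong codec; encoding back through that same codec and decoding with the site's real charset recovers the string. A tiny self-contained illustration (the sample string is made up):

# a GBK-encoded page decoded with the wrong codec produces mojibake...
raw = '简历模板'.encode('gbk')
garbled = raw.decode('iso-8859-1')
# ...round-tripping through the same wrong codec restores the original text
fixed = garbled.encode('iso-8859-1').decode('gbk')
print(garbled)  # unreadable characters
print(fixed)    # 简历模板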


  

Reposted from: https://www.cnblogs.com/gyc-tzx/p/10940400.html
