Python Web Scraping Basics (Part 3)


1. Regular Expressions (Review)

Case study: downloading images from Qiushibaike

# 1 Create the request object
# 2 Fetch the page source
# 3 Parse out the image URLs
# 4 Download the images
import urllib.request
import re
import os

# 1 Create the request object
def create_request(page):
    url = 'https://www.qiushibaike.com/hot/page/' + str(page) + '/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
    }
    request = urllib.request.Request(url=url,headers=headers)
    return request

# 2 Fetch the page source
def get_content(request):
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf-8')
    return content

# 3 Parse out the image URLs
# <div class="thumb">
    # <a href="/article/122119754" target="_blank">
        # <img src="//pic.qiushibaike.com/system/pictures/12211/122119754/medium/S18OUY90ZVAU3WSK.jpg" alt="糗事#122119754" class="illustration" width="100%" height="auto">
    # </a>
# </div>
def parse_srclist(content):
    pattern = re.compile('<div class="thumb">.*?<img src="(.*?)" alt=".*?"',re.S)
    src_list = pattern.findall(content)
    return src_list

# 4 Download the images
def down_load(src_list):
    os.makedirs('./qiubai', exist_ok=True)  # create the output directory if it does not exist
    for src in src_list:
        name = src.split('/')[-1]
        filename = './qiubai/' + name
        url = 'https:' + src  # the page uses protocol-relative URLs, so prepend the scheme
        urllib.request.urlretrieve(url=url,filename=filename)



if __name__ == '__main__':
    start_page = int(input('Enter the start page: '))
    end_page = int(input('Enter the end page: '))
    for page in range(start_page,end_page+1):
        request = create_request(page)
        content = get_content(request)
        src_list = parse_srclist(content)
        down_load(src_list)
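
To see what the re.S flag and the non-greedy (.*?) in parse_srclist are doing, here is a tiny standalone check against the HTML snippet shown in the comments above (a minimal sketch; the sample string is copied from that snippet):

import re

sample = '''
<div class="thumb">
    <a href="/article/122119754" target="_blank">
        <img src="//pic.qiushibaike.com/system/pictures/12211/122119754/medium/S18OUY90ZVAU3WSK.jpg" alt="糗事#122119754" class="illustration" width="100%" height="auto">
    </a>
</div>
'''

# re.S lets . match newlines, so the pattern can span the whole <div> block;
# the non-greedy (.*?) stops at the first closing quote instead of the last one
pattern = re.compile('<div class="thumb">.*?<img src="(.*?)" alt=".*?"', re.S)
print(pattern.findall(sample))
# ['//pic.qiushibaike.com/system/pictures/12211/122119754/medium/S18OUY90ZVAU3WSK.jpg']
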
2. XPath
Basic XPath syntax (a short lxml sketch follows this list):
	1. Path queries
		// : select all descendant nodes, regardless of level
		/  : select direct children only
	2. Predicate queries
		//div[@id]
		//div[@id="maincontent"]
	3. Attribute queries
		//@class
	4. Fuzzy matching
		//div[contains(@id, "he")]
		//div[starts-with(@id, "he")]
	5. Text content
		//div/h1/text()
	6. Logical operators
		//div[@id="head" and @class="s_down"]
		//title | //price
	7. Indexing (1-based):
	//tbody[@id='datalist']/tr/td[3]/span/text()

	td[3] : the third td among its siblings
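
A minimal sketch of these queries with lxml; the HTML below is made-up test data, not a real page:

from lxml import etree

html = '''
<div id="head" class="s_down">
  <h1>Title</h1>
  <ul>
    <li class="item">first</li>
    <li class="item">second</li>
    <li class="item">third</li>
  </ul>
</div>
'''
tree = etree.HTML(html)

print(tree.xpath('//li'))                           # path query: all li descendants
print(tree.xpath('//div[@id="head"]/h1/text()'))    # predicate query + text content
print(tree.xpath('//li/@class'))                    # attribute query
print(tree.xpath('//div[starts-with(@id, "he")]'))  # fuzzy matching
print(tree.xpath('//li[2]/text()'))                 # 1-based index: the second li
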
Using XPath:
	Note: install the XPath Helper browser plugin in advance so expressions can be tested in the browser
	1. Install the lxml library
			pip install lxml -i https://pypi.douban.com/simple
	2. Import lxml.etree
			from lxml import etree
	3. etree.parse()   parses a local file
		    html_tree = etree.parse('XX.html')
	4. etree.HTML()    parses a server response
		    html_tree = etree.HTML(response.read().decode('utf-8'))
	5. html_tree.xpath(xpath expression)
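
A quick sketch of the two entry points. Note that etree.parse() uses a strict XML parser by default, so for ordinary (rarely well-formed) HTML it is safer to pass an etree.HTMLParser(); the file name local.html is just a scratch file for the demo:

from lxml import etree

# write a tiny local file so etree.parse() has something to read
with open('local.html', 'w', encoding='utf-8') as fp:
    fp.write('<html><body><h1>hello</h1></body></html>')

# etree.parse(): read from a file; HTMLParser tolerates loose real-world markup
file_tree = etree.parse('local.html', etree.HTMLParser())
print(file_tree.xpath('//h1/text()'))    # ['hello']

# etree.HTML(): parse a string, e.g. response.read().decode('utf-8')
string_tree = etree.HTML('<html><body><h1>hello</h1></body></html>')
print(string_tree.xpath('//h1/text()'))  # ['hello']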

Application case 1: use XPath to extract all image paths from a Qiushibaike page (a comparison of XPath and regex)

# 1 Create the request object
# 2 Fetch the page source
# 3 Parse the paths
# 4 Download

import urllib.request
import os
from lxml import etree

url = 'https://www.qiushibaike.com/hot/'
headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
    }
# 1 Create the request object
request = urllib.request.Request(url=url,headers=headers)
# 2 Fetch the page source
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
# 3 Parse the paths
tree = etree.HTML(content)
src_list = tree.xpath('//div[@id="content-left"]/div/div[@class="author clearfix"]/a/img/@src')
name_list = tree.xpath('//div[@id="content-left"]/div/div[@class="author clearfix"]/a[2]/h2/text()')
os.makedirs('./qbtouxiang', exist_ok=True)  # create the output directory if it does not exist
for i in range(len(src_list)):
    url = 'https:' + src_list[i]
    name = name_list[i].strip('\n')  # user names come wrapped in newlines
    filename = './qbtouxiang/' + name + '.jpg'
    urllib.request.urlretrieve(url=url,filename=filename)

Application case 2: use XPath to extract Qiushibaike user info (avatar, user name, user level); note the fields do not line up one-to-one across users (anonymous users have no level node)

# Requirement: download the user info for the first ten pages
# Plan: 1 Create the request object
#       2 Fetch the page source
#       3 Parse
#       4 Save the data
import urllib.request
from lxml import etree

def create_request(page):
    url = 'https://www.qiushibaike.com/hot/page/' + str(page) + '/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
    }
    request = urllib.request.Request(url=url,headers=headers)
    return request

def get_content(request):
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf-8')
    return content


def save_content(content):
    tree = etree.HTML(content)
    # //div[@id="content-left"]/div/div[@class="author clearfix"]//img/@src
    src_list = tree.xpath('//div[@id="content-left"]/div/div[@class="author clearfix"]//img/@src')
    # //div[@id="content-left"]/div/div[@class="author clearfix"]//img/@alt
    name_list = tree.xpath('//div[@id="content-left"]/div/div[@class="author clearfix"]//img/@alt')
    # //div[@id="content-left"]/div/div[@class="author clearfix"]/div/text()
    level_list = tree.xpath('//div[@id="content-left"]/div/div[@class="author clearfix"]/div/text()')

    users = []

    for i in range(len(src_list)):
        src = src_list[i]
        name = name_list[i]

        # Anonymous users have this default avatar and no level node, so insert
        # an empty string at this position to keep the lists aligned
        if src == '//static.qiushibaike.com/images/thumb/anony.png?v=b61e7f5162d14b7c0d5f419cd6649c87':
            level_list.insert(i,'')

        level = level_list[i]

        user = {}

        user['src']=src
        user['name']=name
        user['level']=level

        users.append(str(user))

    # append ('a') so that data from earlier pages is not overwritten on each call
    with open('user.txt','a',encoding='utf-8') as fp:
        fp.write(str(users))




if __name__ == '__main__':
    start_page = int(input('Enter the start page: '))
    end_page = int(input('Enter the end page: '))
    for page in range(start_page,end_page+1):
        request = create_request(page)
        content = get_content(request)
        save_content(content)

Application case 3: scrape and download images from Zhanzhang Sucai (http://sc.chinaz.com/tupian/shuaigetupian.html); the page uses lazy loading

How it works: the src attribute of every img tag initially points at the same placeholder (blank) image, while the real image address is stored in a custom attribute on the tag (for example data-src). When JavaScript detects that the image element has entered the visible window, it copies the address from the custom attribute into src, which triggers the actual download and produces the lazy-load effect. The benefit is that the page does not fire a large burst of requests at the server all at once, which could otherwise make responses slow or cause the page to stutter or crash.

Since lazy loading is based on whether an element is inside the visible window, an image is only loaded once it scrolls into view; for a crawler this means the real address has to be read from the custom attribute (here src2) rather than from src.
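
A minimal sketch of what that means when parsing: the snippet below is made-up, but mirrors the structure the code further down relies on, where the real address sits in a src2 attribute:

from lxml import etree

html = '''
<div id="container">
  <div><div><a href="#">
    <img src="placeholder.png" src2="http://example.com/real_photo.jpg" alt="demo">
  </a></div></div>
</div>
'''
tree = etree.HTML(html)
print(tree.xpath('//div[@id="container"]//img/@src'))   # ['placeholder.png']  -- the blank stand-in
print(tree.xpath('//div[@id="container"]//img/@src2'))  # ['http://example.com/real_photo.jpg']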




# //div[@id="container"]/div/div/a/img/@alt
import urllib.request
import os
from lxml import etree

def create_request(page):
    # http://sc.chinaz.com/tag_tupian/HeiRen_3.html
    base_url = 'http://sc.chinaz.com/tag_tupian/HeiRen'
    if page == 1:
        url = base_url + '.html'
    else:
        url = base_url + '_' + str(page) + '.html'

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
    }
    request = urllib.request.Request(url=url,headers=headers)
    return request


def get_content(request):
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf-8')
    return content

def save_content(content):
    tree = etree.HTML(content)
    alt_list = tree.xpath('//div[@id="container"]/div/div/a/img/@alt')
    # because of lazy loading, the real image address lives in src2 rather than src
    src_list = tree.xpath('//div[@id="container"]/div/div/a/img/@src2')
    os.makedirs('./blackwomen', exist_ok=True)  # create the output directory if it does not exist
    for i in range(len(src_list)):
        alt = alt_list[i]
        src = src_list[i]
        suffix = src.split('.')[-1]
        filename = './blackwomen/' + alt + '.' + suffix
        urllib.request.urlretrieve(url=src,filename=filename)


if __name__ == '__main__':
    start_page = int(input('Enter the start page: '))
    end_page = int(input('Enter the end page: '))
    for page in range(start_page,end_page+1):
        request = create_request(page)
        content = get_content(request)
        save_content(content)

3. JsonPath
Installing and using jsonpath:
			Install with pip:
				   pip install jsonpath
			Using jsonpath:
                    obj = json.load(open('xx.json', 'r', encoding='utf-8'))
                    ret = jsonpath.jsonpath(obj, 'jsonpath expression')
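
A minimal sketch of jsonpath on an in-memory dict (the store/book structure is made-up test data):

import jsonpath

obj = {
    'store': {
        'book': [
            {'title': 'Book A', 'price': 10},
            {'title': 'Book B', 'price': 25},
        ]
    }
}

print(jsonpath.jsonpath(obj, '$..title'))               # ['Book A', 'Book B'] -- recursive descent
print(jsonpath.jsonpath(obj, '$.store.book[0].price'))  # [10] -- explicit path
print(jsonpath.jsonpath(obj, '$..nothing'))             # False -- no match returns False, not []
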
JSON conversion functions (a short sketch follows below):
	json.loads()
		converts a JSON string into a Python object
	json.dumps()
		converts a Python object into a JSON-formatted string

# Note: the url in the case below must be the API endpoint found in the browser's Network panel, otherwise the response cannot be parsed as JSON

# What about querying multiple pages (a page parameter)?
	json.load()
		reads JSON text from a file and converts it into a Python object
		json.load(open('a.json'))
	json.dump()
		writes a Python object to a file as JSON text
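
A quick sketch of the four functions (demo.json is just a scratch file):

import json

data = {'name': 'tom', 'level': 3}

# dumps: Python object -> JSON string
s = json.dumps(data, ensure_ascii=False)
print(s)                      # {"name": "tom", "level": 3}

# loads: JSON string -> Python object
print(json.loads(s))          # {'name': 'tom', 'level': 3}

# dump: write a Python object to a file as JSON text
with open('demo.json', 'w', encoding='utf-8') as fp:
    json.dump(data, fp, ensure_ascii=False)

# load: read JSON text from a file back into a Python object
with open('demo.json', 'r', encoding='utf-8') as fp:
    print(json.load(fp))      # {'name': 'tom', 'level': 3}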

Tutorial link: http://blog.csdn.net/luxideyao/article/details/77802389

Application case: Zhilian Zhaopin job listings (salary, company name, position)

# Scrape position, salary and company name from Zhilian Zhaopin

import urllib.request

url = 'https://fe-api.zhaopin.com/c/i/sou?pageSize=90&cityId=538&salary=0,0&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&kw=%E7%88%AC%E8%99%AB%E5%B7%A5%E7%A8%8B%E5%B8%88&kt=3&=0&_v=0.29127774&x-zp-page-request-id=177fd3f08d9942488c2f482baaad9a3d-1565772375633-677458&x-zp-client-id=94df40c0-69a5-40f2-9d66-0bbe28f523a4'

headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
    }
request = urllib.request.Request(url=url,headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')

with open('zhilian.json','w',encoding='utf-8')as fp:
    fp.write(content)

import json
import jsonpath

obj = json.load(open('zhilian.json','r',encoding='utf-8'))

jobName_list = jsonpath.jsonpath(obj,'$..jobName')
salary_list = jsonpath.jsonpath(obj,'$..salary')
companyName_list = jsonpath.jsonpath(obj,'$..company.name')



for i in range(len(jobName_list)):
    jobname = jobName_list[i]
    salary = salary_list[i]
    companyname = companyName_list[i]

    job = {}

    job['jobname']=jobname
    job['salary']=salary
    job['companyname']=companyname

    with open('job.json','a',encoding='utf-8')as fp:
        fp.write(str(job))
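
Since str(job) writes the Python dict repr rather than valid JSON, job.json is not actually machine-readable as JSON. A small variant (a sketch, assuming the three lists built by the code above) that collects the records and dumps them once:

import json

jobs = []
for i in range(len(jobName_list)):
    jobs.append({
        'jobname': jobName_list[i],
        'salary': salary_list[i],
        'companyname': companyName_list[i],
    })

# write the whole list once as real JSON, keeping Chinese characters readable
with open('job.json', 'w', encoding='utf-8') as fp:
    json.dump(jobs, fp, ensure_ascii=False, indent=2)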
