个人学习爬虫的实例小结

最新推荐文章于 2024-05-27 13:16:19 发布

虚幻交界

最新推荐文章于 2024-05-27 13:16:19 发布

阅读量416

点赞数

文章标签： python

本文链接：https://blog.csdn.net/Zz_er/article/details/105921942

版权

个人学习爬虫的实例小结

最近为了帮忙装修淘宝页面的图片，得把商品从一个亲戚的店铺搬到另一个店铺，总觉得很麻烦，于是我便打算借这个机会锤炼一下我的爬虫技术。

需求是获取页面中的主图，主图视频，展示的颜色图片，还有商品详情介绍的图片。

先来导入一些模块

import requests,re,os,time,random #requests模块,re正则表达式模块，os与系统交互的模块，time模块可以获取时间，random随机模块
from bs4 import BeautifulSoup   #引入BeautifulSoup模块
from uuid import uuid4  #用来生成随机不重复的文件名
from multiprocessing import Pool  #使用多进程来加快速度

为了方便使用，我在设计时使用的是动态的url，首先程序运行的时候需要手动输入淘宝页面的url链接。

if __name__=='__main__':
    # Interactive loop: crawl one item page per entered URL until the user enters 0.
    while True:
        s = input('请输入网页地址<0退出>：').strip()   # drop surrounding whitespace
        if s=='0':
            # Checked before the pool is created so that quitting never
            # spawns worker processes that nobody cleans up.
            break
        start = time.time()   # timing starts once a URL has been entered
        Po = Pool(3)          # module-level pool of 3 workers, used by saveData()
        try:
            main(s)
        finally:
            # main() may return early (download failed) or raise before
            # saveData() closes the pool; terminate() releases the workers in
            # every case and is harmless after a normal close()/join().
            Po.terminate()
        end = time.time()
        print('爬虫耗时：%.2fs'%(end-start))

接着来构造main函数

获取页面内容，解析并保存图片，视频，描述文字等信息

def main(url):
    """Crawl a single item page: download it, then parse and save its content.

    Args:
        url: Address of the page to crawl.

    Prints an error and stops when gethtml() yields a falsy result;
    otherwise passes the page text on to getcontent().
    """
    print('The crawler start')

    response = gethtml(url)   # download the page
    if response:
        getcontent(response.text)   # parse the downloaded HTML
        print('The crawler exit')
    else:
        print('error 未能成功获取链接')

接着构造一下gethtml函数，getcontent函数

def gethtml(url,num_retries=2):
    """Download *url* with browser-like headers and return the response.

    Args:
        url: Address to fetch.
        num_retries: Number of additional attempts made when the server
            answers with a 5xx status code.

    Returns:
        The ``requests.Response`` of the last attempt (whatever its status
        code), or ``None`` when the request itself failed (invalid URL,
        connection error, timeout, ...).
    """
    print('Download',url)
    #Pool of user agents; one is picked at random for every request
    user_agents=['Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362',
         'Mozilla/5.0',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
              ]
    #Request headers.
    #'accept-encoding' is deliberately not set here: requests advertises only
    #the encodings it can actually decode, whereas hard-coding 'br' can give
    #back an undecoded body when no Brotli decoder is installed.
    kv={
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cookie': '',          #left empty on purpose; paste the cookie from your own browser if needed
    'cache-control': 'max-age=0',
    'referer': 'https://www.taobao.com/?spm=2013.1.0.0.3a0d5b1cKeEW7w',
    'upgrade-insecure-requests': '1',
    'User-Agent':random.choice(user_agents)
        }

    try:
        r = requests.get(url,headers=kv,timeout=7)  #send the request
    except requests.RequestException as err:
        #Only request failures are handled here, so Ctrl-C and programming
        #errors are no longer swallowed.
        print('Download error',err)
        return None

    r.encoding = r.apparent_encoding   #decode .text using the encoding detected from the body

    if r.status_code >= 400:    #client or server error
        print('Download error',r.text)

    if num_retries and 500<= r.status_code < 600:
        #5xx means the server failed, so the same request may succeed on a retry
        return gethtml(url,num_retries=num_retries-1)

    if r.status_code == 200:
        print('Download Successful!')
    return r

解析内容的时候有些内容容易获取，有些内容较难获取，主图图片，颜色展示图，文本描述比较容易获取，直接在返回的html代码中，而详情内容里的图片和主图视频则较难获取，是动态生成的

首先是我准备用BeautifulSoup定位如下图所示的图片位置，可以用BeautifulSoup的树状结构捕捉到
在这里插入图片描述

在python中调试返回的结果是 “描述加载中” ，于是查看了一下源码，源码中没有相应的内容，说明这部分是异步加载的，果然没这么简单。

在这里插入图片描述

按下F12打开开发者模式，按下面操作找到目标json文件，根据我的知识很多网站都用json来向前端传输信息
在这里插入图片描述

然后下图再按从左到右的箭头找到json文件

在这里插入图片描述

如上图所示的json文件中包含了构成目标图片url的重要片段，可用同一个正则表达式匹配或其他方式获取这些片段，具体内容见代码。

视频也是采用类似的方法，从network中找到目标视频文件，从headers中可以看到它对应的url链接。通过观察多个视频的url，最后断定该链接和videoId、videoOwnerId这两个数据有关。
在这里插入图片描述

def getcontent(html):
    """Parse an item page and pass the extracted data to saveData().

    Extracts the title, the attribute text, the main image URLs, the colour
    image URLs, the description image URLs (listed in a separate resource
    referenced from the page) and the main video URL. Every section except
    the title is best-effort: a failure is reported and leaves that section
    empty.

    Args:
        html: HTML text of the item page.

    Returns:
        None. Nothing is saved when the title cannot be found, because the
        title names the output directory.
    """
    import ast   # local import: only needed for the safe literal parsing below

    print('数据解析中...')
    jpg_list1=[]     # main image URLs
    jpg_list2=[]     # description image URLs
    color_list=[]    # colour image URLs
    detail=''        # stays empty if the attribute list is missing (was unbound before)
    videolink=None   # stays None if the page has no video

    soup = BeautifulSoup(html,'html.parser')

    #Item title
    title_tag = soup.find('h3' ,class_="tb-main-title")
    if title_tag is None or 'data-title' not in title_tag.attrs:
        print('标题解析失败！')
        return
    title = title_tag.attrs['data-title']

    #Attribute text
    try:
        detail = soup.find('ul',class_='attributes-list').text
        print('详情描述解析成功！')
    except AttributeError:   # find() returned None
        print('详情描述解析失败！')

    #Main images: thumbnails are rewritten to their 400x400 variant
    try:
        jpgs = soup.find('ul', id="J_UlThumb", class_="tb-thumb tb-clearfix")
        for each in jpgs('img'):
            url_jpg = each.attrs['data-src'].replace('50x50','400x400')
            if 'http' not in url_jpg:
                url_jpg = 'http:'+url_jpg   # scheme-less URL
            jpg_list1.append(url_jpg)
        print('主图解析成功!')
    except (AttributeError, KeyError, TypeError):
        print('主图解析失败')

    #Colour images: the URL sits inside each anchor's inline background style
    try:
        colors = soup.find('ul',class_="J_TSaleProp tb-img tb-clearfix")
        for each in colors('a'):
            style = each.attrs.get('style')
            if style is None:
                continue   # no inline style on this anchor, so no image URL to extract
            url_color = style.replace('jpg_30x30','jpg_400x400').replace('background:url(','').replace(') center no-repeat;','')
            if 'http' not in style:
                url_color = 'http:'+url_color
            color_list.append(url_color)
        print('颜色展示解析成功!')
    except (AttributeError, TypeError):
        print('颜色展示解析失败！')

    #Description images: listed in a separate resource whose address is embedded in the page
    try:
        findlink = re.compile('(//tds.alicdn.com/json/item_imgs.*),')
        find_num = re.compile(r'\d{5,15}')
        #[:-2] drops the comma matched by the pattern and the character before it
        link = 'https:'+findlink.search(html).group(0)[:-2]
        link_res = gethtml(link)
        if link_res is None or link_res.status_code!=200:
            print('描述图片解析失败!')
        else:
            raw = link_res.text.replace('\n','').replace('\r','').replace('\t','')
            #SECURITY: this text comes from the network. It used to be passed
            #to eval(); literal_eval() accepts only plain literals and cannot
            #execute code.
            #NOTE(review): the fixed offsets 63 and -1 are taken from the
            #original code; confirm them against a current response.
            content_link = ast.literal_eval('{'+raw[63:-1])
            for l in content_link:
                #image URL built from the first 5-15 digit number in the key and the key itself
                li = 'https://img.alicdn.com/imgextra'+find_num.search(l).group(0)+'/'+l
                jpg_list2.append(li)
            print('描述图片解析成功！')
    except Exception:
        #best-effort section: any parsing problem only skips the description images
        print('描述图片解析失败!')

    #Main video: the URL is assembled from two ids embedded in the page
    try:
        find_videoid=re.compile(r'"videoId":"\d{5,20}"')
        videoid = find_videoid.search(html).group(0)[11:-1]    # strip the key prefix and closing quote
        find_ownerid=re.compile(r'"videoOwnerId":"\d{5,20}"')
        ownerid = find_ownerid.search(html).group(0)[16:-1]    # strip the key prefix and closing quote
        videolink = 'https://cloud.video.taobao.com/play/u/%s/p/1/e/6/t/1/%s.mp4'%(ownerid,videoid)
        print('视频解析成功！')
    except AttributeError:   # search() returned None: one of the ids is not in the page
        print('视频解析失败！')
        videolink=None

    data = {
        '标题': title,
        '首图': jpg_list1,
        '颜色': color_list,
        '详情': detail,
        '详情图': jpg_list2,
        '视频': videolink,
    }
    print('Data parsed')
    saveData(data)

最后是saveData

def _download_to(path,url):
    """Download *url* and write the response body to *path*.

    The file is created only after a successful download (status 200), so a
    failed request no longer leaves an empty file behind or raises on a
    missing response.
    """
    r = gethtml(url)
    if r is None or r.status_code != 200:
        return   # gethtml() has already reported the failure
    with open(path,'wb') as f:   # the with-block closes the file
        f.write(r.content)


def savepicture_main(root,url):
    """Save the main image at *url* under ``<root>/图片/首图`` with a random .jpg name."""
    _download_to(root+'/图片/首图/'+str(uuid4())+'.jpg',url)


def savepicture_color(root,url):
    """Save the colour image at *url* under ``<root>/图片/颜色`` with a random .jpg name."""
    _download_to(root+'/图片/颜色/'+str(uuid4())+'.jpg',url)


def savepicture_detail(root,url):
    """Save the description image at *url* under ``<root>/图片/详情图`` with a random .jpg name."""
    _download_to(root+'/图片/详情图/'+str(uuid4())+'.jpg',url)


def getvideo(root,url):
    """Save the video at *url* under ``<root>/图片/首图``.

    The file gets a random name and keeps the last four characters of *url*
    as its extension. Does nothing when *url* is empty or None (no video was
    found on the page).
    """
    if not url:
        return
    _download_to(root+'/图片/首图/'+str(uuid4())+url[-4:],url)

    
def saveData(data):
    """Create the output directory tree and save everything found on the page.

    Downloads are submitted to the module-level pool ``Po``; the attribute
    text is written to ``详情/详情.txt``. The pool is closed and joined before
    returning, even if writing the text file fails.

    Args:
        data: Dict with the keys '标题' (str), '首图', '颜色', '详情图' (lists of
            URLs), '详情' (str) and '视频' (URL or None).
    """
    global Po
    print('数据保存中...')
    #Characters that are invalid in file names (or would create nested
    #directories) are replaced so the title can always be used as a name.
    safe_title = re.sub(r'[\\/:*?"<>|\r\n\t]','_',data['标题'])
    #The last 8 characters of the current timestamp keep repeated runs apart
    root = './%s'%safe_title+str(time.time())[-8:]
    for sub in ('图片/首图','图片/颜色','图片/详情图','详情'):
        os.makedirs(root+'/'+sub,exist_ok=True)

    tasks=[]
    try:
        if data['视频']:   # nothing to download when the page has no video
            tasks.append(Po.apply_async(getvideo,args=(root,data['视频'])))
        for u1 in data['首图']:
            tasks.append(Po.apply_async(savepicture_main,args=(root,u1)))
        for u2 in data['颜色']:
            tasks.append(Po.apply_async(savepicture_color,args=(root,u2)))
        for u3 in data['详情图']:
            tasks.append(Po.apply_async(savepicture_detail,args=(root,u3)))

        with open(root+'/详情/详情.txt','w',encoding='utf-8') as f:
            f.write(data['详情'])
    finally:
        #Wait for every submitted download, even when an error occurred above
        Po.close()
        Po.join()

    #apply_async() keeps worker exceptions inside the result objects; count
    #them so failed downloads are reported instead of disappearing silently.
    failed = sum(1 for t in tasks if not t.successful())
    if failed:
        print('有%d个文件保存失败'%failed)
    print('数据保存完毕！')

虚幻交界

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
个人学习爬虫的实例小结

个人学习爬虫的实例小结最近为了帮忙装修淘宝页面的图片，得把商品从一个亲戚的店铺搬到另一个店铺，总觉得很麻烦，于是我便用了几天时间练练我的爬虫技术。需求是获取页面中的主图，主图视频，展示的颜色图片，还有商品详情介绍的图片。先来导入一些模块import requests,re,os,time,randomfrom bs4 import BeautifulSoup #引入Beautif...
复制链接

扫一扫