Python爬虫学习记录（1）——百度贴吧图片下载

最新推荐文章于 2024-05-03 11:07:56 发布

骆小盼

最新推荐文章于 2024-05-03 11:07:56 发布

阅读量459

点赞数 1

文章标签： python 爬虫 图片 百度

本文链接：https://blog.csdn.net/sinat_25306091/article/details/47862949

版权

#!/usr/bin/python
#coding=utf-8
import os
from urllib.request import urlopen
from urllib.request import urlretrieve
import re
def getHtml(url):
    """Fetch *url* and return the raw response body.

    Args:
        url: Address passed straight to ``urlopen``.

    Returns:
        Whatever the response's ``read()`` yields (bytes); no decoding
        is performed here.
    """
    # The context manager closes the response even if read() raises;
    # previously the response object was never closed.
    with urlopen(url) as page:
        return page.read()

def getImg(html, id, page_num, dump_dir="/usr/lxp/python_test/getImg_Python"):
    """Download every image URL found in one page of HTML.

    The page text is searched for ``http://imgsrc.baidu.com/forum/...jpg``
    URLs (70 to 100 characters between the ``forum/`` prefix and ``jpg``).
    Each hit is saved in the current working directory as
    ``topic_<id>_<page_num>_<index>.jpg``.

    Args:
        html: Page content; converted with ``str()`` before searching.
        id: Topic id string used in the saved file names. (The name
            shadows the builtin ``id``; kept for caller compatibility.)
        page_num: Page number, used in the file names and the dump name.
        dump_dir: Directory that receives a debug copy of the page text
            as ``out_<page_num>``. Pass ``None`` to disable the dump. If
            the file cannot be written the dump is skipped with a message
            instead of aborting the download.

    Returns:
        The list of matched image URLs, in the order they were found.
    """
    reg = r'http:\/\/imgsrc.baidu.com\/forum\/.{70,100}jpg'
    imgre = re.compile(reg)

    # str() (not decode()): for bytes input this produces the repr text
    # "b'...'". Kept deliberately so the pattern matches exactly as before.
    html = str(html)

    if dump_dir is not None:
        dump_path = os.path.join(dump_dir, "out_" + str(page_num))
        try:
            # `with` guarantees the handle is closed even if write() fails.
            with open(dump_path, "w", encoding="utf-8") as f:
                f.write(html)
        except OSError as err:
            # The dump is only a debugging aid; a missing or unwritable
            # directory must not prevent the images from being fetched.
            print('skip html dump ' + dump_path + ': ' + str(err))

    imglist = imgre.findall(html)
    for x, imgurl in enumerate(imglist):
        save_name = 'topic_' + id + '_' + str(page_num) + '_%s.jpg' % x
        urlretrieve(imgurl, save_name)
        # Report only after urlretrieve returned, so the message is true.
        print('download' + save_name + ' sucessfully from ' + imgurl)
    return imglist

def getAllImg(topic_id):
    """Walk a Tieba thread page by page and download its images.

    Pages are requested from
    ``http://tieba.baidu.com/p/<topic_id>?see_lz=1&pn=<page>`` starting at
    page 1. After each page the downloaded ``topic_<topic_id>_*.jpg``
    files are moved from the current directory into ``topic_<topic_id>/``.

    The loop stops when a page's length equals the length of the previous
    page (presumably because the site serves the last page again for
    out-of-range page numbers -- unverified heuristic).

    Args:
        topic_id: Thread id as a string; it is concatenated into the URL,
            the directory name and the file names.

    Returns:
        The page number at which the loop stopped, i.e. the first page
        that was fetched but not processed.
    """
    target_dir = 'topic_' + topic_id
    # Filesystem calls instead of os.system('mkdir ...' / 'mv ...'):
    # topic_id comes from user input and must never reach a shell, and
    # this also works where the mkdir/mv commands are unavailable.
    os.makedirs(target_dir, exist_ok=True)

    # Trailing underscore so that e.g. id "12" does not also collect the
    # files of topic "123", which the old 'topic_12*.jpg' glob would have.
    file_prefix = target_dir + '_'

    page_num = 1
    html_len = 0
    while True:
        html = getHtml("http://tieba.baidu.com/p/" + topic_id + '?see_lz=1&pn=' + str(page_num))
        print(str(html_len) + ' ' + str(len(html)))
        if html_len == len(html):
            break
        getImg(html, topic_id, page_num)
        for name in os.listdir('.'):
            if name.startswith(file_prefix) and name.endswith('.jpg'):
                # os.replace overwrites an existing target, like `mv`.
                os.replace(name, os.path.join(target_dir, name))
        html_len = len(html)
        page_num = page_num + 1
    return page_num

if __name__ == "__main__":
    # Only prompt and download when run as a script, so importing this
    # module does not block on stdin or start network traffic.
    # strip() removes stray whitespace that would otherwise end up inside
    # the request URL and the output file names.
    topic_id = input("topic id:").strip()
    getAllImg(topic_id)