Scraping Baidu Images with Python (source code)

The script below pages through Baidu Images' avatarjson interface, decodes the JSON response, and saves the full-size image behind each result's objURL into a local `./image` directory:

```python
#!/usr/bin/python
# -*- coding:utf-8 -*-

import httplib2
import urllib.request
import json
import re
import os


class BaiduImage(object):

    def __init__(self):
        super(BaiduImage, self).__init__()
        print('Fetching images, press CTRL+C to quit...')
        self.page = 60  # current result offset (the pn parameter)
        if not os.path.exists(r'./image'):
            os.mkdir(r'./image')

    def request(self):
        try:
            urlError = ''
            while 1:
                conn = httplib2.Http()
                # word=风景 is the search keyword ("scenery"); rn=60 asks for 60 results per page
                request_url = ('http://image.baidu.com/search/avatarjson'
                               '?tn=resultjsonavatarnew&ie=utf-8&word=风景'
                               '&cg=girl&rn=60&pn=' + str(self.page))
                headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0',
                           'Content-type': 'text/html'}
                resp, content = conn.request(request_url, 'GET', body=None, headers=headers)
                if resp.status == 200:
                    data = content.decode('UTF-8')
                    decode = json.loads(data)
                    urlError = decode
                    self.download(decode['imgs'])
                    self.page += 60
                    print(self.page)
        except Exception as e:
            # a KeyError on 'imgs' here usually means the result pages are exhausted
            print(e, urlError)

    def download(self, data):
        imgCount = 0
        for d in data:
            # d['thumbURL'] is the 200px thumbnail, d['hoverURL'] the 360px preview;
            # objURL points at the full-size original
            url = ''
            try:
                url = d['objURL']
                print(url)
                img_data = urllib.request.urlopen(url).read()
                pattern = re.compile(r'.*/(.*?)\.jpg', re.S)
                item = re.findall(pattern, url)
                if not item:  # findall returns a list; skip URLs the pattern can't parse
                    continue
                if '.jpg' in url:
                    FileName = 'image/' + item[0] + '.jpg'
                    if not os.path.exists(FileName):
                        with open(FileName, 'wb') as f:
                            f.write(img_data)
                    else:
                        print(FileName + ' already exists!')
            except Exception as e:
                print(e)
            finally:
                imgCount += 1
                print(str(imgCount) + ':' + url)


if __name__ == '__main__':
    bi = BaiduImage()
    bi.request()
```
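
The avatarjson interface above is old and may no longer answer. As a hedged sketch only, the same paged fetch can be written with the `requests` library instead of `httplib2`; this assumes `requests` is installed and that the endpoint and parameters (copied from the script above) still work:

```python
# A minimal sketch of the same fetch using requests instead of httplib2.
# Assumptions: `pip install requests`; the avatarjson endpoint and its
# parameters are taken verbatim from the script above and may have changed.
import requests


def fetch_page(page, word='风景'):
    url = 'http://image.baidu.com/search/avatarjson'
    params = {
        'tn': 'resultjsonavatarnew',
        'ie': 'utf-8',
        'word': word,
        'cg': 'girl',
        'rn': 60,   # results per page
        'pn': page, # result offset
    }
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) '
                             'Gecko/20100101 Firefox/40.0'}
    resp = requests.get(url, params=params, headers=headers, timeout=10)
    resp.raise_for_status()              # raise on non-200 instead of checking status by hand
    return resp.json().get('imgs', [])  # same 'imgs' key the original script reads


if __name__ == '__main__':
    for img in fetch_page(60):
        print(img.get('objURL'))
```

`requests` handles URL encoding of the parameters and JSON decoding itself, which removes the manual string concatenation and the `decode`/`json.loads` steps from the original.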

To grab a web page's HTML source with Python, you can use the `urlopen` function from the `urllib.request` module to open the URL and read its contents. Here is an example:

```python
import urllib.request


def grab(url):
    # open the given URL
    resp = urllib.request.urlopen(url)
    # read the raw page source
    data = resp.read()
    # ask the user for an output file name
    name = input("Enter a file name: ")
    # write the source to the file
    with open(name, "wb") as file_name:
        file_name.write(data)
    print("Source downloaded")


if __name__ == '__main__':
    # enter the URL in full form
    web_addr = input("Enter the URL to grab (e.g. http://www.baidu.com/): ")
    try:
        grab(web_addr)
    except Exception as e:
        print("Invalid URL:", e)
```

In this code, the `grab` function takes a URL as its parameter, opens it with `urlopen`, reads the page source, and writes it to a local file. You can adjust the file name and storage path as needed. Run the script, enter the URL you want to grab, and the page source is saved to a local file.
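
As a smaller variation on `grab`, `urllib.request.urlretrieve` performs the open-read-write sequence in a single call. A minimal sketch (the output file name `page_source.html` is just an illustration):

```python
import urllib.request

# urlretrieve downloads a URL straight into a local file, replacing the
# manual urlopen/read/write sequence above. It returns the local file name
# and the response headers.
url = 'http://www.baidu.com/'
local_file, headers = urllib.request.urlretrieve(url, 'page_source.html')
print('saved to', local_file)
print('content type:', headers.get_content_type())  # e.g. text/html
```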