2019-12-05 Python3 学习爬取图片

上一次作业的再次尝试

import urllib.request
import re

# Target: the CSDN blog front page; collect links to individual articles.
url = 'http://blog.csdn.net/'
# Spoof a desktop Chrome User-Agent so the site serves the normal page.
headers = ('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36')
opener = urllib.request.build_opener()
opener.addheaders = [headers]
urllib.request.install_opener(opener)  # install globally so every urllib call sends the header
data = urllib.request.urlopen(url).read().decode('utf-8', 'ignore')
# Raw string + escaped dots: '\d' in a plain string is an invalid escape
# sequence in Python 3, and an unescaped '.' would match any character.
pat = r'<a href="https(://blog\.csdn\.net/.*?/\d+?)"'
# Re-attach a scheme to the captured tail and de-duplicate in one pass.
# (The original deliberately prefixes 'http', not 'https' — kept as-is.)
result = list({'http' + link for link in re.compile(pat).findall(data)})

# Save each collected article locally as <index>.html.
# NOTE(review): assumes D:/CSDNBLOG/ already exists — urlretrieve does not
# create directories; consider os.makedirs beforehand. TODO confirm.
for index, link in enumerate(result):
    file = 'D:/CSDNBLOG/' + str(index) + '.html'
    urllib.request.urlretrieve(link, file)
    print('第' + str(index + 1) + '次')


# Fetch a single article and save the raw bytes to disk.
data = urllib.request.urlopen('https://blog.csdn.net/kebi007/article/details/103268254').read()
# Context manager guarantees the file handle is closed even if the write fails.
with open('D:/test.html', 'wb') as fh:
    fh.write(data)  # original note: output is still garbled ("还是乱码")

代理

# Fetch the page through the explicit opener built above and decode it,
# skipping undecodable bytes. Codec spelled 'utf-8' for consistency with
# the rest of the file (codec lookup is case-insensitive, so behavior is
# unchanged).
data = opener.open(url).read()
data2 = data.decode('utf-8', 'ignore')

import urllib.request
def use_proxy(url, proxy_address):
    """Download *url* via the HTTP proxy at *proxy_address* ('host:port').

    Side effect: installs the proxy opener globally (install_opener), so
    later urllib calls in this process also go through the proxy.

    Returns the response body decoded as UTF-8, ignoring undecodable bytes.
    """
    handler = urllib.request.ProxyHandler({'http': proxy_address})
    proxy_opener = urllib.request.build_opener(handler, urllib.request.HTTPHandler)
    urllib.request.install_opener(proxy_opener)
    page = urllib.request.urlopen(url).read()
    return page.decode('utf-8', 'ignore')
# Drive use_proxy against a free public proxy and print the page length.
proxy_address='114.239.147.54:9999' # original note: free proxies die fast — retry a few times until one works
url = 'http://www.baidu.com'
data = use_proxy(url,proxy_address)
print(len(data))


# 爬取图片 — scrape product thumbnail images from a Taobao search.
# Fixes over the original draft: the loop body is now indented (it was a
# syntax error), the dead hardcoded debug URL that shadowed the dynamic one
# is removed, the regex actually captures the image URL, and the inner
# download loop has a body.
import urllib.request
import re

keyname = '歪萌社'
key = urllib.request.quote(keyname)  # percent-encode the CJK query for the URL
headers = ('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36')
opener = urllib.request.build_opener()
opener.addheaders = [headers]
urllib.request.install_opener(opener)  # global, so urlretrieve also sends the header

for i in range(0, 1):  # one results page; raise the bound for more (44 items/page)
    url = ('https://s.taobao.com/search?q=' + key
           + '&type=p&tmhkh5=&spm=a21wu.241046-cn.a2227oh.d100'
             '&from=sea_1_searchbutton&catId=100&bcoffset=0&ntoffset=6'
             '&p4ppushleft=1%2C48&s=' + str(i * 44))
    data = urllib.request.urlopen(url).read().decode('utf-8', 'ignore')
    # NOTE(review): each result appears to embed its thumbnail in a
    # "pic_url":"//..." JSON field (protocol-relative URL) — confirm against
    # a live response; Taobao may also now redirect anonymous requests to a
    # login page, which is presumably why the original author gave up ("放弃").
    pat = r'"pic_url":"(//.*?)"'
    images = re.compile(pat).findall(data)
    for j, img in enumerate(images):
        image_url = 'http:' + img
        image_file = 'D:/taobao/' + str(i) + '_' + str(j) + '.jpg'
        urllib.request.urlretrieve(image_url, image_file)