import urllib.request
import re

url = 'http://blog.csdn.net/'
headers = ('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36')
opener = urllib.request.build_opener()
opener.addheaders = [headers]
urllib.request.install_opener(opener)  # install globally so every urllib call sends the browser User-Agent
data = urllib.request.urlopen(url).read().decode('utf-8', 'ignore')
pat = r'<a href="https(://blog\.csdn\.net/.*?/\d+?)"'
result = re.compile(pat).findall(data)
result = ['https' + i for i in result]  # re-attach the scheme stripped by the capture group
result = list(set(result))              # de-duplicate the article links
for i in range(0, len(result)):
    file = 'D:/CSDNBLOG/' + str(i) + '.html'
    urllib.request.urlretrieve(result[i], file)
    print('Downloaded article ' + str(i + 1))
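If some links 404 or the D:/CSDNBLOG/ folder does not exist yet, the loop above simply crashes. A slightly more defensive version might look like the sketch below; the retry-free "skip and report" behaviour and the directory creation are assumptions added here, not part of the original code.

import os
import urllib.request

save_dir = 'D:/CSDNBLOG/'              # same target folder as above
os.makedirs(save_dir, exist_ok=True)   # create it if it is missing

for i, link in enumerate(result):
    file = save_dir + str(i) + '.html'
    try:
        urllib.request.urlretrieve(link, file)
        print('Downloaded article ' + str(i + 1))
    except Exception as err:           # skip links that 404 or time out
        print('Failed on ' + link + ': ' + str(err))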
data = urllib.request.urlopen('https://blog.csdn.net/kebi007/article/details/103268254').read()
fh = open('D:/test.html', 'wb')
fh.write(data)
fh.close()  # the saved file still comes out garbled
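One possible cause of the garbled file is that the server returns a gzip-compressed body, so the raw bytes written to disk are not HTML at all. That CSDN actually does this is an assumption, not something verified here; a minimal sketch that checks the Content-Encoding header before saving:

import gzip
import urllib.request

url = 'https://blog.csdn.net/kebi007/article/details/103268254'
resp = urllib.request.urlopen(url)
raw = resp.read()

# If the response is compressed, decompress it before writing to disk.
if resp.headers.get('Content-Encoding') == 'gzip':
    raw = gzip.decompress(raw)

with open('D:/test.html', 'wb') as fh:
    fh.write(raw)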
Proxies
# Calling the opener's open() directly also works, without installing it globally:
data = opener.open(url).read()
data2 = data.decode('UTF-8', 'ignore')
import urllib.request

def use_proxy(url, proxy_address):
    proxy = urllib.request.ProxyHandler({'http': proxy_address})
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    urllib.request.install_opener(opener)
    data = urllib.request.urlopen(url).read().decode('utf-8', 'ignore')
    return data

proxy_address = '114.239.147.54:9999'  # free proxies are unreliable; it may take several tries before one succeeds
url = 'http://www.baidu.com'
data = use_proxy(url, proxy_address)
print(len(data))
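Because free proxies die quickly, it can help to loop over a list of candidates and keep the first one that answers. A minimal sketch built on the use_proxy function above; the second address in the list is a placeholder, not a known-working server.

# Hypothetical candidate list; replace with addresses from a current free-proxy site.
candidates = ['114.239.147.54:9999', '118.24.0.1:8080']

data = None
for addr in candidates:
    try:
        data = use_proxy('http://www.baidu.com', addr)
        print('proxy ' + addr + ' worked, got ' + str(len(data)) + ' characters')
        break
    except Exception as err:  # connection refused, timeout, etc.
        print('proxy ' + addr + ' failed: ' + str(err))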
# Crawling images
import urllib.request
import re

keyname = '歪萌社'
key = urllib.request.quote(keyname)  # URL-encode the Chinese keyword for use in the search URL
headers = ('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36')
opener = urllib.request.build_opener()
opener.addheaders = [headers]
urllib.request.install_opener(opener)  # install globally so every urllib call sends the browser User-Agent
for i in range(0, 1):