目录
练习:从网上爬取图片到本地
图片来自1号店
可以先将一号店的网页代码爬取到一个HTML中
import urllib.request
import os
import re
def imageCrawler(url, topath):
    """Fetch the page at *url* and save its raw HTML bytes to 'a.html'.

    First step of the tutorial: dump the page source so the <img src>
    pattern can be inspected offline.

    NOTE(review): *topath* is accepted but not used -- the HTML is always
    written to 'a.html' in the current directory, exactly as the original
    did. TODO: confirm whether the output should go under *topath*.
    """
    # Desktop-browser User-Agent: many sites reject urllib's default one.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60'
    }
    req = urllib.request.Request(url, headers=headers)
    # Close the HTTP response deterministically (the original leaked it).
    with urllib.request.urlopen(req) as response:
        html = response.read()
    with open(r'a.html', 'wb') as f:
        f.write(html)
# Entry point: fetch the Yhd ("1号店") search page for fashion dresses.
# The keyword in the URL is double-percent-encoded ("%25..."), copied
# straight from the browser address bar.
url = 'https://search.yhd.com/c0-0/k%25E6%2597%25B6%25E5%25B0%259A%25E8%25A3%2599%25E8%25A3%2585/'
# Local image directory. NOTE(review): this first version of
# imageCrawler never actually uses this argument.
toPath = r'C:\Users\asus\Desktop\img'
imageCrawler(url,toPath)
a.html
在网页中找到每张图片的img src来源
然后利用正则表达式去匹配(.*?)里的图片地址,再通过urllib.request.urlretrieve把图片下载到本地存储
最终代码:
import os
import re
import urllib.request
def imageCrawler(url, topath):
    """Crawl product images from the search-result page at *url*.

    Finds every <img src="//..."> that follows a
    <div style="position: relative"> in the page source and downloads the
    images into the directory *topath* as 1.jpg, 2.jpg, ...
    """
    # Desktop-browser User-Agent: many sites reject urllib's default one.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60'
    }
    req = urllib.request.Request(url, headers=headers)
    # with-block closes the response (the original leaked the connection).
    with urllib.request.urlopen(req) as response:
        html = response.read().decode('utf-8')
    # The page uses protocol-relative URLs ("//host/path"); the group
    # captures everything after the leading "//". re.S lets '.' span
    # newlines inside the tag.
    pat = r'<div style="position: relative">\n<img src="//(.*?)"/>'
    re_image = re.compile(pat, re.S)
    imagelist = re_image.findall(html)
    print(imagelist)
    print(len(imagelist))
    # enumerate replaces the original's manual counter; numbering still
    # starts at 1 so the saved filenames are identical.
    for num, imageurl in enumerate(imagelist, start=1):
        path = os.path.join(topath, str(num) + '.jpg')
        # Download the image to local storage; re-attach a scheme because
        # the captured address is protocol-relative.
        urllib.request.urlretrieve('http://' + imageurl, filename=path)
# Run the crawler: search URL (double-percent-encoded keyword) and the
# local directory where the numbered .jpg files are written.
url = 'https://search.yhd.com/c0-0/k%25E6%2597%25B6%25E5%25B0%259A%25E8%25A3%2599%25E8%25A3%2585/'
toPath = r'C:\Users\asus\Desktop\img'
imageCrawler(url,toPath)
运行结果:
练习:爬取网络中的QQ号
从豆瓣的这个页面中爬取QQ号
代码:
import urllib.request
import os
import re
import ssl
from collections import deque
def writeFile1(htmlBytes, topath):
    """Dump raw response bytes to the file at *topath* (binary mode)."""
    with open(topath, 'wb') as out:
        out.write(htmlBytes)
def writeFileStr(htmlBytes, topath):
    """Write the page content to *topath* as UTF-8 text.

    Fix: the original body was a byte-for-byte copy of writeFile1 (it
    opened the file in 'wb' and wrote raw bytes), despite the name and
    its intended use (dumping a readable .txt). Bytes arguments are now
    decoded to str before writing; a str argument passes straight through.
    """
    if isinstance(htmlBytes, bytes):
        text = htmlBytes.decode('utf-8')
    else:
        text = htmlBytes
    with open(topath, 'w', encoding='utf-8') as f:
        f.write(text)
def gethtmlbytes(url):
    """Return the raw body bytes of the page at *url*.

    Sends a desktop-browser User-Agent (some sites reject urllib's
    default) and disables SSL certificate verification, as the original
    did. NOTE(review): the unverified SSL context is insecure -- it is
    tolerable only for this scraping exercise.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60'
    }
    req = urllib.request.Request(url, headers=headers)
    context = ssl._create_unverified_context()
    # with-block closes the response (the original leaked the connection).
    with urllib.request.urlopen(req, context=context) as response:
        return response.read()
def qqCrawler(url, topath):
    """Scrape candidate QQ numbers from *url* and append them to *topath*.

    Returns the list of URL regex matches found on the page so the caller
    can keep crawling. Each item is the tuple of all regex groups; the
    full URL is at index 0 (center() relies on item[0]).
    """
    htmlbytes = gethtmlbytes(url)
    # str() on bytes keeps the original behaviour: patterns are matched
    # against the repr ("b'...'"), not against decoded text. That is good
    # enough for digit and URL patterns and avoids decode errors.
    htmlStr = str(htmlbytes)

    # Candidate QQ numbers: 5-10 digits with no leading zero.
    pat = r'[1-9]\d{4,9}'
    re_qq = re.compile(pat)
    qqlist = re_qq.findall(htmlStr)
    # De-duplicate (result order is not significant).
    qqlist = list(set(qqlist))
    # Append to the output file; the with-block guarantees the handle is
    # closed even if a write fails (original used open/close by hand).
    with open(topath, 'a') as f:
        for qqstr in qqlist:
            f.write(qqstr + '\n')

    # http/ftp/https URLs to feed back into the crawl queue. Raw string
    # added so the backslash escapes stop triggering invalid-escape
    # warnings; the pattern text itself is unchanged.
    pat = r'(((http|ftp|https)://)(([a-zA-Z0-9\._-]+\.[a-zA-Z]{2,6})|([0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}))(:[0-9]{1,4})*(/[a-zA-Z0-9\&%_\./-~-]*)?)'
    re_url = re.compile(pat)
    urllist = re_url.findall(htmlStr)
    # De-duplicate the URL tuples as well.
    urllist = list(set(urllist))
    return urllist
# Douban topic page to scrape, and the output file for the QQ numbers.
url = 'https://www.douban.com/group/topic/110094603/'
topath = r'b.txt'
# qqCrawler(url,topath)  -- single-page run, kept for reference; the
# multi-page crawl is driven by center() instead.
# Central controller: breadth-first crawl driven by a FIFO queue of URLs.
def center(url, topath):
    """BFS-crawl starting from *url*, feeding every page to qqCrawler.

    Fix: a *visited* set now prevents re-queueing URLs that were already
    scheduled -- the original queue grew without bound and revisited the
    same pages forever. The crawl still runs until the frontier is empty
    (which may be a very long time on a well-linked site).
    """
    queue = deque([url])
    visited = {url}
    while queue:
        targetUrl = queue.popleft()
        urllist = qqCrawler(targetUrl, topath)
        for item in urllist:
            tempurl = item[0]  # group 0 of the regex match = full URL
            if tempurl not in visited:
                visited.add(tempurl)
                queue.append(tempurl)
center(url, topath)
运行结果:可以一直爬很久,这里是手动停止的
一起学习,一起进步 -.- ,如有错误,可以发评论