**python之初识爬虫(下篇)**
自从上次写了一个简易的小脚本之后,我就去写其他方面的东西了。昨天无意间把之前写的脚本翻出来运行了一下,发现结果是这样的:
之前的不能用了,那就改一下吧
对之前的各个点打印出来和以前对比,发现
之前的是这样的。。。
问题发现了,再写一个url拼接就OK
最终代码如下
from urllib import request
import re
import urllib
import os
from urllib import parse
def gethtml(url):
    """Fetch ``url`` and return the response body decoded as UTF-8.

    The decoded page is also echoed to stdout.

    Args:
        url: Address passed to ``urllib.request.urlopen``.

    Returns:
        The response body as a ``str``.
    """
    # The context manager closes the response even if read()/decode() raises.
    with urllib.request.urlopen(url) as page:
        html = page.read().decode('utf-8')
    print(html)
    return html
def geturls(html):
    """Return the ``/tupian/....html`` link targets found in ``html``.

    Args:
        html: Page source to scan.

    Returns:
        A list with the captured ``href`` value of every
        ``<a href="/tupian/....html"`` occurrence, in document order.
    """
    # Raw string so ``\.`` reaches the regex engine untouched. ``[^"]*``
    # keeps each match inside a single href value; a greedy ``.*`` would
    # swallow everything up to the last matching link on the same line.
    link_re = re.compile(r'<a href="(/tupian/[^"]*\.html)"')
    return link_re.findall(html)
def geturllist(html):
    """Return absolute URLs for every link that ``geturls`` finds in ``html``.

    Each relative link is resolved against the site root with
    ``urllib.parse.urljoin``.
    """
    site_root = "http://sc.chinaz.com"
    absolute_links = []
    for relative_link in geturls(html):
        absolute_links.append(parse.urljoin(site_root, relative_link))
    return absolute_links
def getimgs(html, path='D:\\test'):
    """Download every ``<img src2="//....jpg"`` image referenced in ``html``.

    Images are written to ``path`` as ``0.jpg``, ``1.jpg``, ... in document
    order. Numbering restarts at 0 on every call, so a second call with the
    same ``path`` overwrites earlier files.

    Args:
        html: Page source to scan.
        path: Destination directory; created if it does not exist. The
            default is the directory the script originally hard-coded.

    Returns:
        The list of absolute ``http:`` image URLs that were downloaded.
    """
    # ``[^"]*`` stops at the closing quote of the attribute (a greedy ``.*``
    # could span several tags), and ``\.`` matches a literal dot only.
    img_re = re.compile(r'<img src2="(//[^"]*\.jpg)"')
    # The page uses protocol-relative URLs; urljoin supplies the scheme.
    imglist = [parse.urljoin("http:", src) for src in img_re.findall(html)]
    os.makedirs(path, exist_ok=True)
    for index, imgurl in enumerate(imglist):
        # os.path.join picks the right separator on every platform.
        target = os.path.join(path, '{0}.jpg'.format(index))
        urllib.request.urlretrieve(imgurl, target)
    return imglist
def gethtmls(page_urls=None, path='D:\\test'):
    """Visit each page and download the ``<img src2="//....jpg"`` images on it.

    Files are saved in ``path`` as ``0.jpg``, ``1.jpg``, ...; the counter
    runs across all pages so images from different pages do not overwrite
    each other. Each page URL, page body and image URL is printed to stdout.

    Args:
        page_urls: Iterable of page URLs to visit. When omitted, the
            module-level ``urls`` is used, so ``gethtmls()`` keeps working
            as before.
        path: Destination directory; created if it does not exist. The
            default is the directory the script originally hard-coded.

    Returns:
        None.
    """
    if page_urls is None:
        # Backward compatible: the script assigns the global ``urls``
        # before calling gethtmls() without arguments.
        page_urls = urls

    # Loop-invariant setup is done once. ``[^"]*`` keeps a match inside one
    # attribute value and ``\.`` matches a literal dot before ``jpg``.
    img_re = re.compile(r'<img src2="(//[^"]*\.jpg)"')
    os.makedirs(path, exist_ok=True)

    x = 0
    for m in page_urls:
        print(m)
        # Close the response as soon as the body has been read.
        with urllib.request.urlopen(m) as page:
            htmls = page.read().decode('utf-8')
        print(htmls)
        for src in img_re.findall(htmls):
            # Protocol-relative URL -> absolute; one urljoin is sufficient.
            imgurl = parse.urljoin("http:", src)
            print(imgurl)
            urllib.request.urlretrieve(
                imgurl, os.path.join(path, '{0}.jpg'.format(x)))
            x += 1
# Script entry: download the picture index page first.
html=gethtml('http://sc.chinaz.com/tupian/')
# gethtmls() reads the module-level name ``urls``, so it must be assigned
# before the call below.
urls=geturllist(html)
# Visit every detail page collected above and save the images found there.
gethtmls()
注:此代码仅作为学习交流使用