# -*-coding:utf8-*-
import re
import urllib.request
import os
def save(imageName, data):
    """Fetch the resource at URL *data* and store it under *imageName*.

    imageName: file name (no directory) under the hard-coded jdImages dir.
    data:      URL string of the resource to download.

    Bug fixed: the original wrote str(body) to a file opened in "wb" mode,
    which raises TypeError (binary-mode files require bytes, not str).
    The context manager also guarantees the handle is closed on error.
    """
    body = urllib.request.urlopen(data).read()
    # NOTE(review): destination directory is hard-coded and Windows-specific.
    name = "E:/Scrapy_Project/jdImages/" + imageName
    with open(name, "wb") as f:
        f.write(body)
#1、获取京东某个页面的图片,但是urlretrieve报错file.close()
def craw(url, page):
    """Download every 220x220 product image from one JD list page.

    url:  the JD list page to scrape.
    page: page number, used as a prefix in the saved file names.

    Fixes: decode the response instead of matching against the str() repr
    of a bytes object; escape the literal '.' in the image pattern; return
    quietly when the product-list section is missing (the original raised
    IndexError); and advance the file-name counter exactly once per image
    (the original bumped it up to three times on a failed download).
    """
    html = urllib.request.urlopen(url, timeout=3).read().decode("utf-8", "ignore")
    # Narrow the search to the product-list section of the page.
    section_match = re.search(r'<div id="plist".+? <div class="page clearfix">', html)
    if section_match is None:
        return
    section = section_match.group(0)
    # Dots escaped: an unescaped '.jpg' would also match e.g. 'xjpg'.
    image_pat = r'<img width="220" height="220" data-img="1" src="//(.+?)\.jpg">'
    for x, stem in enumerate(re.findall(image_pat, section), start=1):
        image_url = "http://" + stem + ".jpg"
        image_name = "E:/Scrapy_Project/jdImages/" + str(page) + str(x) + ".jpg"
        try:
            urllib.request.urlretrieve(image_url, filename=image_name)
        except urllib.error.URLError:
            # Best-effort: skip the failed image and clear urlretrieve's cache.
            urllib.request.urlcleanup()
#2 获取csdn中某个url中的所有url
def getLink(url):
    """Return the de-duplicated http(s) links found in the page at *url*.

    Each element is a tuple (full_url, last_fragment) because the pattern
    has two capturing groups — callers index [0] to get the URL itself.

    Fixes: raw string for the regex (the original's '\\s', '\\.', '\\w' in a
    plain string are invalid escape sequences), decode the response instead
    of regexing the bytes repr, and close the response handle.
    """
    headers = ('User-Agent', "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36")
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    # Install globally so subsequent urlopen calls reuse the header.
    urllib.request.install_opener(opener)
    with urllib.request.urlopen(url) as resp:
        data = resp.read().decode("utf-8", "ignore")
    pattern = r'(https?://[^\s)";]+\.(\w|/)*)'
    links = re.findall(pattern, data)
    # set() drops duplicates; order is not significant to callers.
    return list(set(links))
#3、获取糗事百科中的所有作者和内容;
def getContent(url, page):
    """Print each author/joke pair scraped from a qiushibaike.com page.

    url:  page to scrape.
    page: page number, echoed in the printed user label.

    Fixes: the original used exec() to fabricate variables named
    'content1', 'content2', ... and then exec()'d print calls against
    them — fragile, unsafe, and it raised NameError whenever the user
    list was longer than the content list. zip() pairs them directly
    and stops at the shorter list.
    """
    headers = ('User-Agent', "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36")
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    urllib.request.install_opener(opener)
    data = urllib.request.urlopen(url).read().decode('utf-8')
    # re.S lets '.' cross newlines inside multi-line jokes.
    userList = re.findall(r'<h2>(.*?)</h2>', data, re.S)
    contentList = re.findall(r'<div class="content">(.*?)</div>', data, re.S)
    for y, (user, content) in enumerate(zip(userList, contentList), start=1):
        # Strip newlines and the <span> wrapper the site puts around jokes.
        content = content.replace('\n', "").replace('<span>', "").replace('</span>', "")
        print("用户" + str(page) + str(y) + "是" + user)
        print("内容是")
        print(content)
        print("\n")
if __name__ == '__main__':
    # Exercise 1 — JD image crawl (disabled):
    #   for i in range(1, 10):
    #       craw("https://list.jd.com/list.html?cat=9987,653,659&page=" + str(i), i)
    # Exercise 2 — CSDN link extraction (disabled):
    #   for link in getLink("http://blog.csdn.net"):
    #       print(link[0])
    # Exercise 3 — qiushibaike authors and jokes (active):
    for page_no in range(1, 2):
        url = "https://www.qiushibaike.com/"
        getContent(url, page_no)
import re
import urllib.request
import os
def save(imageName, data):
    """Fetch the resource at URL *data* and store it under *imageName*.

    imageName: file name (no directory) under the hard-coded jdImages dir.
    data:      URL string of the resource to download.

    Bug fixed: the original wrote str(body) to a file opened in "wb" mode,
    which raises TypeError (binary-mode files require bytes, not str).
    The context manager also guarantees the handle is closed on error.
    """
    body = urllib.request.urlopen(data).read()
    # NOTE(review): destination directory is hard-coded and Windows-specific.
    name = "E:/Scrapy_Project/jdImages/" + imageName
    with open(name, "wb") as f:
        f.write(body)
#1、获取京东某个页面的图片,但是urlretrieve报错file.close()
def craw(url, page):
    """Download every 220x220 product image from one JD list page.

    url:  the JD list page to scrape.
    page: page number, used as a prefix in the saved file names.

    Fixes: decode the response instead of matching against the str() repr
    of a bytes object; escape the literal '.' in the image pattern; return
    quietly when the product-list section is missing (the original raised
    IndexError); and advance the file-name counter exactly once per image
    (the original bumped it up to three times on a failed download).
    """
    html = urllib.request.urlopen(url, timeout=3).read().decode("utf-8", "ignore")
    # Narrow the search to the product-list section of the page.
    section_match = re.search(r'<div id="plist".+? <div class="page clearfix">', html)
    if section_match is None:
        return
    section = section_match.group(0)
    # Dots escaped: an unescaped '.jpg' would also match e.g. 'xjpg'.
    image_pat = r'<img width="220" height="220" data-img="1" src="//(.+?)\.jpg">'
    for x, stem in enumerate(re.findall(image_pat, section), start=1):
        image_url = "http://" + stem + ".jpg"
        image_name = "E:/Scrapy_Project/jdImages/" + str(page) + str(x) + ".jpg"
        try:
            urllib.request.urlretrieve(image_url, filename=image_name)
        except urllib.error.URLError:
            # Best-effort: skip the failed image and clear urlretrieve's cache.
            urllib.request.urlcleanup()
#2 获取csdn中某个url中的所有url
def getLink(url):
    """Return the de-duplicated http(s) links found in the page at *url*.

    Each element is a tuple (full_url, last_fragment) because the pattern
    has two capturing groups — callers index [0] to get the URL itself.

    Fixes: raw string for the regex (the original's '\\s', '\\.', '\\w' in a
    plain string are invalid escape sequences), decode the response instead
    of regexing the bytes repr, and close the response handle.
    """
    headers = ('User-Agent', "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36")
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    # Install globally so subsequent urlopen calls reuse the header.
    urllib.request.install_opener(opener)
    with urllib.request.urlopen(url) as resp:
        data = resp.read().decode("utf-8", "ignore")
    pattern = r'(https?://[^\s)";]+\.(\w|/)*)'
    links = re.findall(pattern, data)
    # set() drops duplicates; order is not significant to callers.
    return list(set(links))
#3、获取糗事百科中的所有作者和内容;
def getContent(url, page):
    """Print each author/joke pair scraped from a qiushibaike.com page.

    url:  page to scrape.
    page: page number, echoed in the printed user label.

    Fixes: the original used exec() to fabricate variables named
    'content1', 'content2', ... and then exec()'d print calls against
    them — fragile, unsafe, and it raised NameError whenever the user
    list was longer than the content list. zip() pairs them directly
    and stops at the shorter list.
    """
    headers = ('User-Agent', "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36")
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    urllib.request.install_opener(opener)
    data = urllib.request.urlopen(url).read().decode('utf-8')
    # re.S lets '.' cross newlines inside multi-line jokes.
    userList = re.findall(r'<h2>(.*?)</h2>', data, re.S)
    contentList = re.findall(r'<div class="content">(.*?)</div>', data, re.S)
    for y, (user, content) in enumerate(zip(userList, contentList), start=1):
        # Strip newlines and the <span> wrapper the site puts around jokes.
        content = content.replace('\n', "").replace('<span>', "").replace('</span>', "")
        print("用户" + str(page) + str(y) + "是" + user)
        print("内容是")
        print(content)
        print("\n")
if __name__ == '__main__':
    # Exercise 1 — JD image crawl (disabled):
    #   for i in range(1, 10):
    #       craw("https://list.jd.com/list.html?cat=9987,653,659&page=" + str(i), i)
    # Exercise 2 — CSDN link extraction (disabled):
    #   for link in getLink("http://blog.csdn.net"):
    #       print(link[0])
    # Exercise 3 — qiushibaike authors and jokes (active):
    for page_no in range(1, 2):
        url = "https://www.qiushibaike.com/"
        getContent(url, page_no)