import re
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Check whether a link opens normally.
def get_url(url, timeout=10):
    """Request *url* and print the URL followed by ``success`` or ``fail``.

    ``success`` is printed only for an HTTP 200 response.  Any other status
    code, and any ``requests.RequestException`` (connection error, timeout,
    malformed URL), is reported as ``fail`` so that one dead link does not
    abort the caller's loop.

    Args:
        url: The address to request.
        timeout: Seconds to wait for the server before giving up.  Without
            a timeout, requests may wait indefinitely.

    Returns:
        None. The result is only printed.
    """
    print('%s' % url)
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # An unreachable or invalid link is exactly what this check is
        # meant to surface, so report it instead of propagating.
        print('fail')
        return
    if response.status_code == 200:
        print('success')
    else:
        print('fail')
# Get the maximum page number of the current listing.
def get_page_max(url='http://www.tianhong.cn/list-5835.html', timeout=10):
    """Return the page count shown in the pager of the listing at *url*.

    The value is the text of the third-from-last ``<a>`` inside the
    ``<div class="thPages">`` element.
    NOTE(review): this assumes the pager always has at least three anchors
    and that the third-from-last one is the highest page number; confirm
    against the live markup.

    Args:
        url: Listing page whose pager is read.  Defaults to the category
            page this script was written for.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The anchor text as a string (the caller converts it with ``int``).

    Raises:
        requests.RequestException: If the page cannot be fetched.
        AttributeError, IndexError: If the expected pager markup is absent.
    """
    rep = requests.get(url, timeout=timeout)
    page_soup = BeautifulSoup(rep.text, 'html.parser')
    pager_links = page_soup.find('div', class_='thPages').find_all('a')
    return pager_links[-3].text
# Get the banner-ad and logo image links of the current page.
def get_main_html_pageurl(url, timeout=10):
    """Return the image URLs of the top banner and the logo on page *url*.

    The first ``<img>`` inside ``<div class="topbanner">`` and the first
    ``<img>`` inside ``<div class="logo">`` are looked up, in that order.
    Each ``src`` is resolved against *url*, so absolute, protocol-relative
    and site-relative values all yield a URL that can be requested.  An
    element that is missing, or has no ``src``, is skipped instead of
    raising.

    Args:
        url: Page to inspect.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list with zero to two absolute image URLs (banner first).

    Raises:
        requests.RequestException: If the page cannot be fetched.
    """
    rep = requests.get(url, timeout=timeout)
    rep_page = BeautifulSoup(rep.text, 'html.parser')
    rep_pictureurl = []
    for css_class in ('topbanner', 'logo'):
        container = rep_page.find('div', class_=css_class)
        img = container.find('img') if container is not None else None
        src = img.get('src') if img is not None else None
        if src:
            rep_pictureurl.append(urljoin(url, src))
    return rep_pictureurl
# Get the product image links on the current page.
def get_main_pictureurl(url, timeout=10):
    """Return the ``src`` of every ``<img>`` in the ``ul.spList`` of *url*.

    The attribute is read straight from the parsed tag rather than by a
    regex over the serialized markup: a greedy pattern captures across
    attributes when several follow ``src`` and finds nothing when none
    does.  Each ``src`` is resolved against *url*; images without a
    ``src`` are skipped.

    Args:
        url: Listing page to inspect.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of absolute image URLs in document order; empty when the
        page has no ``<ul class="spList">``.

    Raises:
        requests.RequestException: If the page cannot be fetched.
    """
    rep = requests.get(url, timeout=timeout)
    rep_page = BeautifulSoup(rep.text, 'html.parser')
    product_list = rep_page.find('ul', class_='spList')
    if product_list is None:
        return []
    return [
        urljoin(url, img['src'])
        for img in product_list.find_all('img')
        if img.get('src')
    ]
# Get the product links on the current page.
def get_commodity_url(url, timeout=10):
    """Return the product hrefs found in the ``ul.spList`` of page *url*.

    Every ``<a>`` inside the list is serialized and matched against
    ``a href="..." tag=``, so only anchors whose ``href`` is directly
    followed by a ``tag`` attribute contribute a link.  The hrefs are
    returned exactly as they appear in the markup (the caller in this file
    adds the host).

    Args:
        url: Listing page to inspect.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of href strings in document order; empty when the page has
        no ``<ul class="spList">``.

    Raises:
        requests.RequestException: If the page cannot be fetched.
    """
    rep = requests.get(url, timeout=timeout)
    page_soup = BeautifulSoup(rep.text, 'html.parser')
    product_list = page_soup.find('ul', class_='spList')
    if product_list is None:
        return []
    # Compiled once; the pattern is the same for every anchor.
    href_pattern = re.compile(r'.*a href="(.*)" tag=.*')
    rep_url = []
    for anchor in product_list.find_all('a'):
        rep_url.extend(href_pattern.findall(str(anchor)))
    return rep_url
# Get the image links of a product detail page.
def get_Details_url(url, timeout=10):
    """Return the image URLs found on the product detail page *url*.

    Two sources are combined, in this order:

    1. For every ``<a>`` inside ``<div class="m1l">``, each double-quoted
       and then each single-quoted string starting with ``http`` in the
       anchor's serialized markup.
    2. The ``src`` of every ``<img>`` inside the first
       ``<div class="box">``, resolved against *url*.  Images without a
       ``src`` are skipped so no ``None`` entry reaches the caller.

    A missing container contributes nothing instead of raising.

    Args:
        url: Product detail page to inspect.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of URL strings (possibly empty).

    Raises:
        requests.RequestException: If the page cannot be fetched.
    """
    rep = requests.get(url, timeout=timeout)
    page_soup = BeautifulSoup(rep.text, 'html.parser')
    rep_url = []

    anchor_container = page_soup.find('div', class_='m1l')
    if anchor_container is not None:
        for anchor in anchor_container.find_all('a'):
            markup = str(anchor)
            rep_url.extend(re.findall('"(http.*?)"', markup))
            rep_url.extend(re.findall(r'\'(http.*?)\'', markup))

    image_container = page_soup.find('div', class_='box')
    if image_container is not None:
        for img in image_container.find_all('img'):
            src = img.get('src')
            if src:
                rep_url.append(urljoin(url, src))
    return rep_url
# Check whether every link and image on each listing page opens normally.
def run_main():
    """Walk every listing page and check all links found on it.

    For each page number from 1 to ``get_page_max()`` this checks, via
    ``get_url``: the listing page itself, its banner/logo and product
    images, every product link, and every image link on each product's
    detail page.  A progress line is printed after each page and the total
    elapsed time in seconds is printed at the end.

    Product hrefs are resolved against the listing URL with ``urljoin`` so
    that absolute hrefs are left intact and relative ones get the host.
    """
    start = time.time()
    for i in range(1, int(get_page_max()) + 1):
        url = 'http://www.tianhong.cn/catalog/product_list.html?categoryId=5835&districtCode=100005&orderType=1&justDisplayInventory=0&justDisplayBySelfSupport=0&minSalePrice=0&maxSalePrice=0&pager.pageNumber=' + str(i)
        get_url(url)
        # Image links on the listing page.
        for line in get_main_html_pageurl(url) + get_main_pictureurl(url):
            get_url(line)
        # Product links.
        for href in get_commodity_url(url):
            product_url = urljoin(url, href)
            get_url(product_url)
            # Image links on the product detail page.
            for j in get_Details_url(product_url):
                get_url(j)
        print('完成第', i, '页')
    end = time.time()
    print(end - start)
# Start the crawl only when this file is executed as a script.
if __name__ == "__main__":
    run_main()