# Python爬虫视频教程零基础小白到scrapy爬虫高手-轻松入门
# https://item.taobao.com/item.htm?spm=a1z38n.10677092.0.0.482434a6EmUbbW&id=564564604865
# 顺利100网站64秒
# 200网站570秒就搞不懂了,差距太大了。。
# NOTE(review): the four lines above were bare prose (a SyntaxError when run);
# commented out so the module parses. The `# -*- coding -*-` declaration below
# only takes effect on line 1 or 2 — confirm the file is saved as UTF-8.
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 15 08:53:08 2016
采集化工标准补录项目
@author: Administrator
"""
import requests,bs4,openpyxl,time
from openpyxl.cell import get_column_letter,column_index_from_string
# --- module-level script setup: open the workbook and define constants ---
# Start timestamp. time.clock() was removed in Python 3.8; perf_counter()
# is the documented replacement for measuring elapsed wall time.
timeBegin = time.perf_counter()
excelName = "hb_sites.xlsx"
sheetName = "Sheet1"
wb1 = openpyxl.load_workbook(excelName)
# Workbook.get_sheet_by_name() is deprecated; index the workbook directly.
sheet = wb1[sheetName]
start = 1
# Label prefixes removed from the scraped <h5> text before writing to cells.
del_content1 = "标准编号:"
del_content2 = "发布部门:"
del_content3 = "实施日期:"
# Dropped three dead no-op expressions from the original:
#   sheet.get_highest_row() / sheet.get_highest_column()  (deprecated; results
#   were discarded — use sheet.max_row / sheet.max_column if needed)
#   requests.codes.ok  (a bare constant lookup with no effect)
#每个网站爬取相应数据
def Craw(site):
    """Scrape one standards page and fill the current row of the sheet.

    Fetches *site*, reads every <h5> element, and writes the standard code,
    issuing department and implementation date into columns B, C and D of
    the module-level ``sheet`` at the module-level ``row`` (set by the
    driver loop — this function deliberately keeps that coupling so the
    call signature stays ``Craw(site)``).

    :param site: URL of the standards detail page to fetch.
    :raises requests.RequestException: on network failure or timeout
        (the caller's try/except skips the row).
    """
    # A timeout prevents one dead host from hanging the whole crawl.
    res = requests.get(site, timeout=30)
    # The target pages are GBK-encoded; requests' charset guess is wrong here.
    res.encoding = 'gbk'
    soup1 = bs4.BeautifulSoup(res.text, "lxml")
    for tag in soup1.select('h5'):
        content = tag.getText()
        # BUG FIX: the original used str.strip(prefix), which removes any of
        # the prefix's *characters* from both ends of the string and can
        # corrupt the value; replace() removes the exact label text instead.
        if "标准编号" in content:
            sheet['B' + str(row)].value = content.replace(del_content1, "")
        if "发布部门" in content:
            sheet['C' + str(row)].value = content.replace(del_content2, "")
        if "实施日期" in content:
            sheet['D' + str(row)].value = content.replace(del_content3, "")
def TimeCount(begin=None, end=None):
    """Print and return the elapsed time in seconds.

    Generalized (backward-compatibly) to accept explicit timestamps; when
    called with no arguments it behaves as before, reading the module-level
    ``timeBegin`` and ``timeEnd`` globals.

    :param begin: start timestamp; defaults to module-level ``timeBegin``.
    :param end: end timestamp; defaults to module-level ``timeEnd``.
    :return: elapsed seconds, ``end - begin``.
    """
    if begin is None:
        begin = timeBegin
    if end is None:
        end = timeEnd
    elapsed = end - begin
    # Fixed the "Comsuming" typo in the original message.
    print("time consuming:%f seconds" % elapsed)
    return elapsed
# Drive the crawl: column A of rows 2..200 holds the source URLs.
for row in range(2, 200 + 1):
    site = sheet['A' + str(row)].value
    if not site:
        # Empty cell — nothing to fetch for this row.
        continue
    try:
        Craw(site)
    except Exception as e:
        # Narrowed from a bare `except:` (which also swallowed
        # KeyboardInterrupt/SystemExit); report the failure and keep going.
        print("row %d failed: %s" % (row, e))
        continue
# Persist all scraped values once the loop finishes.
wb1.save(excelName)
# End timestamp — time.clock() was removed in Python 3.8.
timeEnd = time.perf_counter()
timeComsuming = TimeCount()