# Python爬虫视频教程零基础小白到scrapy爬虫高手-轻松入门
# https://item.taobao.com/item.htm?spm=a1z38n.10677092.0.0.482434a6EmUbbW&id=564564604865
# 顺利100网站64秒
# 200网站570秒就搞不懂了,差距太大了。。
# NOTE(review): the four lines above were bare prose (a SyntaxError when run);
# commented out so the module parses. The `# -*- coding -*-` declaration below
# only takes effect on line 1 or 2 — confirm the file is saved as UTF-8.
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 15 08:53:08 2016
采集化工标准补录项目
@author: Administrator
"""
import requests,bs4,openpyxl,time
from openpyxl.cell import get_column_letter,column_index_from_string
# --- module-level script setup: open the workbook and define constants ---
# Start timestamp. time.clock() was removed in Python 3.8; perf_counter()
# is the documented replacement for measuring elapsed wall time.
timeBegin = time.perf_counter()
excelName = "hb_sites.xlsx"
sheetName = "Sheet1"
wb1 = openpyxl.load_workbook(excelName)
# Workbook.get_sheet_by_name() is deprecated; index the workbook directly.
sheet = wb1[sheetName]
start = 1
# Label prefixes removed from the scraped <h5> text before writing to cells.
del_content1 = "标准编号:"
del_content2 = "发布部门:"
del_content3 = "实施日期:"
# Dropped three dead no-op expressions from the original:
#   sheet.get_highest_row() / sheet.get_highest_column()  (deprecated; results
#   were discarded — use sheet.max_row / sheet.max_column if needed)
#   requests.codes.ok  (a bare constant lookup with no effect)
#每个网站爬取相应数据
def Craw(site):
    """Scrape one standards page and fill the current row of the sheet.

    Fetches *site*, reads every <h5> element, and writes the standard code,
    issuing department and implementation date into columns B, C and D of
    the module-level ``sheet`` at the module-level ``row`` (set by the
    driver loop — this function deliberately keeps that coupling so the
    call signature stays ``Craw(site)``).

    :param site: URL of the standards detail page to fetch.
    :raises requests.RequestException: on network failure or timeout
        (the caller's try/except skips the row).
    """
    # A timeout prevents one dead host from hanging the whole crawl.
    res = requests.get(site, timeout=30)
    # The target pages are GBK-encoded; requests' charset guess is wrong here.
    res.encoding = 'gbk'
    soup1 = bs4.BeautifulSoup(res.text, "lxml")
    for tag in soup1.select('h5'):
        content = tag.getText()
        # BUG FIX: the original used str.strip(prefix), which removes any of
        # the prefix's *characters* from both ends of the string and can
        # corrupt the value; replace() removes the exact label text instead.
        if "标准编号" in content:
            sheet['B' + str(row)].value = content.replace(del_content1, "")
        if "发布部门" in content:
            sheet['C' + str(row)].value = content.replace(del_content2, "")
        if "实施日期" in content:
            sheet['D' + str(row)].value = content.replace(del_content3, "")
def TimeCount(begin=None, end=None):
    """Print and return the elapsed time in seconds.

    Generalized (backward-compatibly) to accept explicit timestamps; when
    called with no arguments it behaves as before, reading the module-level
    ``timeBegin`` and ``timeEnd`` globals.

    :param begin: start timestamp; defaults to module-level ``timeBegin``.
    :param end: end timestamp; defaults to module-level ``timeEnd``.
    :return: elapsed seconds, ``end - begin``.
    """
    if begin is None:
        begin = timeBegin
    if end is None:
        end = timeEnd
    elapsed = end - begin
    # Fixed the "Comsuming" typo in the original message.
    print("time consuming:%f seconds" % elapsed)
    return elapsed
# Drive the crawl: column A of rows 2..200 holds the source URLs.
for row in range(2, 200 + 1):
    site = sheet['A' + str(row)].value
    if not site:
        # Empty cell — nothing to fetch for this row.
        continue
    try:
        Craw(site)
    except Exception as e:
        # Narrowed from a bare `except:` (which also swallowed
        # KeyboardInterrupt/SystemExit); report the failure and keep going.
        print("row %d failed: %s" % (row, e))
        continue
# Persist all scraped values once the loop finishes.
wb1.save(excelName)
# End timestamp — time.clock() was removed in Python 3.8.
timeEnd = time.perf_counter()
timeComsuming = TimeCount()