Python 爬取 table 数据

from lxml import etree
import urllib.request
import time
import datetime
import xlwt
from asq.initiators import query

#得至需要爬的地址的内容
def getHtml(url):
    """Download *url* and return the raw response body as bytes.

    The response object is used as a context manager so the HTTP
    connection is closed promptly instead of leaking until garbage
    collection.
    """
    with urllib.request.urlopen(url) as response:
        return response.read()

data = list()

# The target site paginates its table, so build one URL per page (1-69).
urls = ['http://www.178448.com/fjzt-2.html?page={}'.format(str(i)) for i in range(1, 70)]
for url in urls:
    html = getHtml(url)

    # Parse the page and select the bordered table's header cells and rows.
    tree = etree.HTML(html)
    heads = tree.xpath('//table[@border="1"]/thead/tr/td/text()')  # header texts (currently unused)
    rows = tree.xpath('//table[@border="1"]/tbody/tr')

    # Append each row as its own list so the cells stay grouped per row.
    # NOTE: c.text is None for cells whose content is wrapped in a child
    # element such as <a>; the second version of this script handles that.
    for row in rows:
        data.append([c.text for c in row.getchildren()])

    # Be polite to the server: pause one second between page fetches.
    time.sleep(1)

print(len(data))

# Keep only rows whose timestamp (column 5) is later than 2018-05-13,
# then group by column 3 and count the rows in each group.
# datetime objects compare directly, so the original
# time.mktime(... .timetuple()) round-trip on both sides is unnecessary.
CUTOFF = datetime.datetime(2018, 5, 13)
res_list = (query(data)
            .where(lambda p: datetime.datetime.strptime(p[5], "%Y-%m-%d %H:%M") > CUTOFF)
            .group_by(lambda p: p[3])
            .select(lambda g: [g[0][3], g.count(lambda p: p[3])])
            .to_list())

print(res_list)

# Optional: dump the scraped rows into an Excel sheet via xlwt.
#workbook = xlwt.Workbook(encoding='utf-8')
#data_sheet = workbook.add_sheet('demo')
#index = 0
#for row in data:
#    for x, item in enumerate(row):
#        data_sheet.write(index, x, item)
#    index += 1
#workbook.save('demo.xls')

 

发现一个问题:如果 tr 里面有 &lt;a&gt; 标记,就取不到名称,所以又改了一下:

# -*- coding: utf-8 -*
from lxml import etree
import xml.etree.ElementTree as ET
import urllib.request
import time
import datetime
import xlwt
from asq.initiators import query

def getHtml(url):
    """Download *url* and return the raw response body as bytes.

    The response object is used as a context manager so the HTTP
    connection is closed promptly instead of leaking until garbage
    collection.
    """
    with urllib.request.urlopen(url) as response:
        return response.read()
data = list()

# The catalogue is paginated; fetch pages 1-9.
urls = ['http://219.140.69.151/opac/newbookpub?page={}'.format(str(i)) for i in range(1, 10)]


def _clean(text):
    """Strip the tab / CRLF padding the site puts inside table cells.

    Tolerates None (lxml returns None for cells with element children).
    """
    return (text or '').replace('\t', '').replace('\r\n', '')


for url in urls:
    html = getHtml(url)
    tree = etree.HTML(html)
    rows = tree.xpath('//table/tr')

    for row in rows:
        cells = row.getchildren()
        if len(cells) < 4:
            # Skip header / malformed rows lacking the four expected cells;
            # the original crashed with IndexError on these.
            continue
        # BUG FIX: the original used cells[0].xpath('//td/a').  An XPath
        # starting with '//' is absolute and searches the WHOLE document,
        # so every row got the FIRST link on the page.  './/a' searches
        # relative to this row's first cell.
        links = cells[0].xpath('.//a')
        if links:
            # Serialize the <a> element so the link markup (and its text)
            # is preserved, matching the original intent.
            title = etree.tostring(links[0], encoding="UTF-8").decode('utf-8')
        else:
            # No link in the cell: fall back to its plain text.
            title = _clean(cells[0].text)
        data.append([title,
                     _clean(cells[1].text),
                     _clean(cells[2].text),
                     _clean(cells[3].text)])

    # Pause between requests to avoid hammering the server.
    time.sleep(1)

for row in data:
    print(row)
print(len(data))
#res_list = query(data).where(lambda p: time.mktime(datetime.datetime.strptime(p[5], "%Y-%m-%d %H:%M").timetuple())>time.mktime(datetime.datetime.strptime('2018-05-13 00:00', "%Y-%m-%d %H:%M").timetuple())).group_by(lambda p:p[3]).select(lambda p:[p[0][3],p.count(lambda p: p[3])]).to_list()

#print(data)
#workbook = xlwt.Workbook(encoding='utf-8')
#data_sheet = workbook.add_sheet('demo')
#index = 0
#for row in data:
#    for x, item in enumerate(row):
#        data_sheet.write(index, x, item)
#    index += 1
#workbook.save('demo.xls')
 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值