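# Note: numeric fields are extracted with re.search(r"\d+\.?\d*", rate1[i*3].string).group(); re.search returns a match object (not a list) and .group() returns the matched text as a plain string, whereas re.findall is the call that returns a list (printed with []).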
# coding=utf-8
import os
import re
import sys
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from lxml import etree
# Python 2 only: make UTF-8 the default codec for implicit str/unicode
# conversions. reload(sys) is called first because setdefaultencoding is not
# available on the already-imported sys module. Python 3 has neither name in
# this form, so the hack is skipped there instead of raising NameError.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding("utf-8")

# Header *value* only. The field name comes from the dict key below, so the
# value must not start with 'User-Agent: ' (that would send the name twice).
user_agent = 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)'
headers = {'User-Agent': user_agent}
#定义一个爬虫
class spider(object):
    """Small scraper for a paginated listing site.

    It can fetch a page, expand a ``page=N`` URL into a list of page URLs,
    download the images referenced on a page, and collect per-game
    deposit / period / rebate figures into column lists.
    """

    # First integer or decimal number inside a text fragment.
    _NUMBER_RE = re.compile(r"\d+\.?\d*")
    # The page query parameter that changepage() reads and rewrites.
    _PAGE_RE = re.compile(r"page=(\d+)")
    # Folder savepic() writes downloaded images into.
    _PIC_DIR = 'pic'

    def __init__(self):
        print(u'开始爬取内容。。。')

    def getsource(self, url):
        """Fetch ``url`` with requests and return the response text."""
        return requests.get(url).text

    def changepage(self, url, total_page):
        """Return the URLs for every page from the one in ``url`` to ``total_page``.

        Args:
            url: address containing a ``page=<digits>`` parameter; its number
                is the first page of the result.
            total_page: last page number, inclusive.

        Returns:
            A list of URLs, one per page; empty when the page in ``url`` is
            already greater than ``total_page``.

        Raises:
            ValueError: if ``url`` contains no ``page=<digits>`` parameter.
        """
        match = self._PAGE_RE.search(url)
        if match is None:
            raise ValueError('no "page=<number>" parameter in url: %s' % url)
        now_page = int(match.group(1))
        # A compiled pattern is used so no flag can end up in re.sub's
        # positional ``count`` slot, which would cap the number of replacements.
        return [self._PAGE_RE.sub('page=%s' % i, url)
                for i in range(now_page, total_page + 1)]

    def getpic(self, source):
        """Return the ``src`` of every image under the ``ul.ali`` list in ``source``."""
        selector = etree.HTML(source)
        return selector.xpath('//ul[@class="ali"]/li/div/a/img/@src')  # adjustable

    def savepic(self, pic_url, link=''):
        """Download each image URL in ``pic_url`` into the ``pic`` folder.

        Files are named ``<digits of link>-<index>.jpg``. ``link`` is passed
        in explicitly rather than read from a module-level variable; when it
        is omitted the digit prefix is empty.

        Args:
            pic_url: iterable of image URLs.
            link: page URL the images came from, used only to build file names.
        """
        picnamestr = ''.join(re.findall(r'(\d+)', link))  # adjustable
        # Create the target folder on first use instead of failing on open().
        if not os.path.isdir(self._PIC_DIR):
            os.makedirs(self._PIC_DIR)
        for i, each in enumerate(pic_url):
            print('now downloading:' + each)
            pic = requests.get(each)
            filename = picnamestr + '-' + str(i) + '.jpg'
            # ``with`` closes the file even if the write fails.
            with open(os.path.join(self._PIC_DIR, filename), 'wb') as fp:
                fp.write(pic.content)

    def ppic(self, link):
        """Fetch the page at ``link`` and save every image found on it."""
        print(u'正在处理页面:' + link)
        html = self.getsource(link)
        pic_url = self.getpic(html)
        self.savepic(pic_url, link)

    @classmethod
    def _first_number(cls, text):
        """Return the first integer/decimal in ``text`` as a string, or None."""
        if text is None:
            # BeautifulSoup's ``.string`` is None for tags without a single text child.
            return None
        found = cls._NUMBER_RE.search(text)
        return found.group() if found else None

    def get_info(self, links):
        """Collect game name, deposit, period and rebate from every page in ``links``.

        The indexing assumes each game contributes five ``<dd>`` tags (the
        first holding its name) and three ``<span>`` tags (deposit, period,
        rebate, in that order). Pages that cannot be fetched are reported
        and skipped. A field without a number is stored as None so the four
        columns stay the same length.

        Args:
            links: iterable of page URLs.

        Returns:
            A dict mapping the four column titles to equally long lists.
        """
        game_name = []
        deposit = []
        period = []
        rebate = []
        for y in links:
            try:
                r2 = requests.get(y, headers=headers)
            except requests.RequestException:
                # Report the URL that actually failed and carry on.
                print('wrong %s' % y)
                continue
            s2 = BeautifulSoup(r2.text, 'lxml')
            rate = s2.find_all('dd')
            rate1 = s2.find_all('span')
            print("---begin to print rate---")
            # Floor division keeps this an int on Python 3 as well.
            # NOTE(review): the "- 1" skips the last group of five <dd> tags,
            # presumably a trailing non-game entry; confirm against the page.
            for i in range(len(rate) // 5 - 1):
                game_name.append(rate[i * 5].string)
                deposit.append(self._first_number(rate1[i * 3].string))
                period.append(self._first_number(rate1[i * 3 + 1].string))
                rebate.append(self._first_number(rate1[i * 3 + 2].string))
        return {'游戏名称': game_name,
                '押金': deposit,
                '周期': period,
                '返利': rebate}
# Start of the interval reported by the elapsed-time message at the end.
time1 = time.time()

if __name__ == '__main__':
    # Starting URL; changepage() rewrites its page number.
    url = 'http://www.wowpower.com/showNewGame?sort=looks&sortseq=down&page=1'  # adjustable
    picspider = spider()
    all_links = picspider.changepage(url, 11)  # adjustable: last page to fetch
    print(all_links)

    # Build a table from the column lists returned by get_info() and write
    # it out as a GBK-encoded CSV without the index column.
    p = pd.DataFrame(picspider.get_info(all_links))
    p.to_csv(
        'e://rate1.csv',
        index=False,
        encoding='gbk',
        columns=['游戏名称', '押金', '周期', '返利'],
        header=['游戏名称', '押金', '周期', '返利'],
    )

    # Image download is switched off; the disabled loop was:
    # for link in all_links:
    #     picspider.ppic(link)

    time2 = time.time()
    print(u'耗时:' + str(time2 - time1))