爬虫小程序(实习僧网抓取数据)

# -*- coding:utf-8 -*-  
import re
import urllib
import urllib2
import time
from pyExcelerator import *
# The bare string literal below is an earlier header setup that was switched
# off by quoting it; it is evaluated as an expression and discarded.
"""
User_Agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0'
header = {}
header['User-Agent'] = User_Agent
"""
# Another discarded variant:
#headers = ('User-Agent','Mozilla/5.0 (Windows NT 6.1) Chrome/29.0.1547.66')

# HTTP request headers: a single User-Agent entry.
headers = {}
headers['User-Agent'] = (
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
)

def getHtml(url):
    """Download ``url`` with ``urllib.urlopen`` and return the response body.

    NOTE(review): the visible source defines a second ``getHtml`` further
    down; that later definition rebinds the name once the module has been
    executed, so this version is effectively superseded.

    Args:
        url: Address to fetch.

    Returns:
        Whatever ``read()`` on the opened response yields.

    Raises:
        Any error raised by ``urllib.urlopen`` or ``read()`` propagates
        unchanged.
    """
    page = urllib.urlopen(url)
    try:
        return page.read()
    finally:
        # Always release the connection, including when read() raises.
        page.close()


def getHtml(url):
    """Download ``url`` via ``urllib2`` and return the response body.

    The request carries the module-level ``headers`` dict.

    Args:
        url: Address to fetch.

    Returns:
        Whatever ``read()`` on the opened response yields.

    Raises:
        Any error raised by ``urllib2.urlopen`` or ``read()`` propagates
        unchanged.
    """
    req = urllib2.Request(url, headers=headers)
    res = urllib2.urlopen(req)
    try:
        return res.read()
    finally:
        # Always release the connection, including when read() raises.
        res.close()

#Position (job title)
def getImg(html):
    """Collect the position names found in ``html``.

    A name is the value of a ``title="..."`` attribute that is immediately
    followed by `` target="_blank">``.

    Args:
        html: Page markup to scan.

    Returns:
        List of captured attribute values in document order (empty when
        nothing matches).
    """
    title_pattern = re.compile(r'title="(.*?)" target="_blank">')
    return title_pattern.findall(html)

#Location
def getPlace(html):
    """Collect the location strings found in ``html``.

    A location is the content of the ``<span>`` that directly follows an
    ``<i class="addr">`` element whose text ends in ``b;``.

    Args:
        html: Page markup to scan.

    Returns:
        List of captured span contents in document order (empty when
        nothing matches).
    """
    # re.S: '.' also matches newlines, so an element may span several lines.
    place_pattern = re.compile(
        '<i class="addr">.*?b;</i><span>(.*?)</span>', re.S)
    return place_pattern.findall(html)

#Salary
def getSalary(html):
    """Collect the salary strings found in ``html``.

    Inside a ``<span class="money_box">`` element, the salary is the text
    between the first ``b;</i>`` and the two whitespace characters that
    immediately precede ``</span>``.

    Args:
        html: Page markup to scan.

    Returns:
        List of captured salary strings in document order (empty when
        nothing matches).
    """
    # Raw string: the \s escapes are handed to the regex engine verbatim
    # instead of depending on Python passing unknown string escapes through.
    # re.S: '.' also matches newlines, so an element may span several lines.
    salary_pattern = re.compile(
        r'<span class="money_box">.*?b;</i>(.*?)\s\s</span>', re.S)
    return salary_pattern.findall(html)

#Internship schedule
def getDay(html):
    """Collect the internship-time strings found in ``html``.

    Inside a ``<span class="day_box">`` element, the value is the text
    between the first ``a;</i>`` and the single whitespace character that
    immediately precedes ``</span>``.

    Args:
        html: Page markup to scan.

    Returns:
        List of captured strings in document order (empty when nothing
        matches).
    """
    # Raw string: the \s escape is handed to the regex engine verbatim
    # instead of depending on Python passing unknown string escapes through.
    # re.S: '.' also matches newlines, so an element may span several lines.
    day_pattern = re.compile(
        r'<span class="day_box">.*?a;</i>(.*?)\s</span>', re.S)
    return day_pattern.findall(html)

#Company
def getCompany(html):
    """Collect the company names found in ``html``.

    A name is the value of the ``title`` attribute that directly follows
    ``class="company_name" target="_blank"``.

    Args:
        html: Page markup to scan.

    Returns:
        List of captured attribute values in document order (empty when
        nothing matches).
    """
    # re.S: '.' also matches newlines inside the captured value.
    company_pattern = re.compile(
        'class="company_name" target="_blank" title="(.*?)"', re.S)
    return company_pattern.findall(html)

#Publication time
def getUpdate(html):
    """Collect the publication-time strings found in ``html``.

    The value is the text between the first ``c;</i>`` that follows an
    ``<i class="job_time">`` tag and the single whitespace character that
    immediately precedes ``</span>``.

    Args:
        html: Page markup to scan.

    Returns:
        List of captured strings in document order (empty when nothing
        matches).
    """
    # Raw string: the \s escape is handed to the regex engine verbatim
    # instead of depending on Python passing unknown string escapes through.
    # re.S: '.' also matches newlines, so an element may span several lines.
    update_pattern = re.compile(
        r'<i class="job_time">.*?c;</i>(.*?)\s</span>', re.S)
    return update_pattern.findall(html)


   
#html = getHtml("http://www.shixiseng.com/interns?p=1")


# Scrape listing pages 1-9 and store the extracted fields in 'output.xls'.
w = Workbook()                  # create a workbook
ws = w.add_sheet('results')     # create a worksheet
# Row 0 holds the column titles.
ws.write(0, 0, u'岗位')
ws.write(0, 1, u'公司')
ws.write(0, 2, u'地点')
ws.write(0, 3, u'薪资')
ws.write(0, 4, u'实习时间')
ws.write(0, 5, u'发布时间')

num2 = 1  # first spreadsheet row used by the page being processed

for k in range(1, 10):
    time.sleep(2)  # wait two seconds before each page request
    # A parenthesised single argument prints the same text under the
    # Python 2 print statement.
    print('打印到第' + str(k) + '页')

    website = "http://www.shixiseng.com/interns?p=" + str(k)
    print(website)

    request = urllib2.Request(website)
    response = urllib2.urlopen(request)
    try:
        html = response.read()
    finally:
        # Release the connection even when read() raises.
        response.close()

    # Run each extractor once per page instead of once per written cell.
    # The list position is the spreadsheet column index.
    columns = [
        getImg(html),
        getCompany(html),
        getPlace(html),
        getSalary(html),
        getDay(html),
        getUpdate(html),
    ]

    for title in columns[0]:
        print(title.decode("utf-8"))

    for col, values in enumerate(columns):
        for offset, value in enumerate(values):
            ws.write(num2 + offset, col, value.decode("utf-8"))

    # Advance past the longest column so the next page never overwrites
    # rows of this one, even when the extractors return different counts.
    num2 += max(len(values) for values in columns)

w.save('output.xls')     # save the workbook



    

#print getImg(html)
#print getImg(html).encode('utf-8')


# Disabled debugging loops kept inside a bare string literal: the text below
# is evaluated as a string expression and discarded, so none of it runs.
"""
for i in range(len(getImg(html))):
    print getImg(html)[i]


for j in range(len(getPlace(html))):
    print getPlace(html)[j]

for j in range(len(getSalary(html))):
    print getSalary(html)[j]


for j in range(len(getDay(html))):
    print getDay(html)[j].decode("utf-8")

for j in range(len(getCompany(html))):
    print getCompany(html)[j].decode("utf-8")

for j in range(len(getUpdate(html))):
    print getUpdate(html)[j].decode("utf-8")
"""

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值