# -*- coding:utf-8 -*-
import re
import urllib
import urllib2
import time
from pyExcelerator import *
# Default HTTP request headers. The User-Agent presents the client as a
# desktop Firefox browser; getHtml() passes this dict to urllib2.Request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6',
}
def getHtml(url):
    """Fetch ``url`` with ``urllib.urlopen`` and return the raw response body.

    No custom request headers are sent.

    NOTE: a second ``getHtml`` is defined right after this one in the same
    file; that later definition rebinds the name, so this version is not the
    one callers get at runtime.

    :param url: address of the page to download.
    :return: the response body as returned by ``read()``.
    """
    page = urllib.urlopen(url)
    try:
        return page.read()
    finally:
        # Release the connection even when read() raises.
        page.close()
def getHtml(url):
    """Fetch ``url`` with ``urllib2`` and return the raw response body.

    The request carries the module-level ``headers`` dict (a browser
    User-Agent).

    :param url: address of the page to download.
    :return: the response body as returned by ``read()``.
    :raises urllib2.URLError: propagated from ``urllib2.urlopen`` on failure.
    """
    req = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(req)
    try:
        return response.read()
    finally:
        # Release the connection even when read() raises.
        response.close()
# Job title: compiled once at import time instead of on every call.
_TITLE_RE = re.compile(r'title="(.*?)" target="_blank">')


def getImg(html):
    """Return the job titles found in ``html``.

    A title is the value of every ``title="..."`` attribute that is
    immediately followed by `` target="_blank">``. The function name is kept
    as-is for existing callers even though it does not extract images.

    :param html: page markup to search.
    :return: list of captured title strings, in document order (empty list
        when nothing matches).
    """
    return _TITLE_RE.findall(html)
# Location: compiled once; re.S lets ``.`` span line breaks inside the markup.
_PLACE_RE = re.compile(r'<i class="addr">.*?b;</i><span>(.*?)</span>', re.S)


def getPlace(html):
    """Return the location strings found in ``html``.

    A location is the text of the ``<span>`` that directly follows an
    ``<i class="addr">`` icon element whose content ends in ``b;``.

    :param html: page markup to search.
    :return: list of captured location strings, in document order (empty list
        when nothing matches).
    """
    return _PLACE_RE.findall(html)
# Salary: compiled once; raw string so ``\s`` reaches the regex engine intact.
_SALARY_RE = re.compile(r'<span class="money_box">.*?b;</i>(.*?)\s\s</span>', re.S)


def getSalary(html):
    """Return the salary strings found in ``html``.

    A salary is the text inside a ``<span class="money_box">`` that follows
    the icon element ending in ``b;</i>``. The pattern requires two whitespace
    characters before ``</span>``; they are not part of the result.

    :param html: page markup to search.
    :return: list of captured salary strings, in document order (empty list
        when nothing matches).
    """
    return _SALARY_RE.findall(html)
# Internship schedule: compiled once; raw string keeps ``\s`` a regex escape.
_DAY_RE = re.compile(r'<span class="day_box">.*?a;</i>(.*?)\s</span>', re.S)


def getDay(html):
    """Return the internship-schedule strings found in ``html``.

    A schedule is the text inside a ``<span class="day_box">`` that follows
    the icon element ending in ``a;</i>``. The pattern requires one whitespace
    character before ``</span>``; it is not part of the result.

    :param html: page markup to search.
    :return: list of captured strings, in document order (empty list when
        nothing matches).
    """
    return _DAY_RE.findall(html)
# Company: compiled once at import time instead of on every call.
_COMPANY_RE = re.compile(r'class="company_name" target="_blank" title="(.*?)"', re.S)


def getCompany(html):
    """Return the company names found in ``html``.

    A company name is the ``title`` attribute value of every element carrying
    ``class="company_name" target="_blank"`` immediately before the title.

    :param html: page markup to search.
    :return: list of captured company names, in document order (empty list
        when nothing matches).
    """
    return _COMPANY_RE.findall(html)
# Publish time: compiled once; raw string keeps ``\s`` a regex escape.
_UPDATE_RE = re.compile(r'<i class="job_time">.*?c;</i>(.*?)\s</span>', re.S)


def getUpdate(html):
    """Return the publish-time strings found in ``html``.

    A publish time is the text that follows an ``<i class="job_time">`` icon
    element ending in ``c;</i>``, up to the enclosing ``</span>``. The pattern
    requires one whitespace character before ``</span>``; it is not part of
    the result.

    :param html: page markup to search.
    :return: list of captured strings, in document order (empty list when
        nothing matches).
    """
    return _UPDATE_RE.findall(html)
def _write_column(sheet, first_row, col, values):
    """Write UTF-8 byte strings ``values`` down column ``col`` from ``first_row``."""
    for offset, value in enumerate(values):
        sheet.write(first_row + offset, col, value.decode("utf-8"))


# Output workbook: a single sheet with a header row followed by one row per
# scraped listing.
w = Workbook()
ws = w.add_sheet('results')
ws.write(0, 0, u'岗位')
ws.write(0, 1, u'公司')
ws.write(0, 2, u'地点')
ws.write(0, 3, u'薪资')
ws.write(0, 4, u'实习时间')
ws.write(0, 5, u'发布时间')

num2 = 1  # first free data row; row 0 holds the header
for k in range(1, 10):
    time.sleep(2)  # pause between page requests
    print('打印到第' + str(k) + '页')
    website = "http://www.shixiseng.com/interns?p=" + str(k)
    print(website)

    response = urllib2.urlopen(urllib2.Request(website))
    try:
        html = response.read()
    finally:
        # Release the connection even when read() raises.
        response.close()

    # Run each extractor exactly once per page; the column order matches the
    # header row written above.
    titles = getImg(html)
    columns = (
        titles,
        getCompany(html),
        getPlace(html),
        getSalary(html),
        getDay(html),
        getUpdate(html),
    )

    for title in titles:
        print(title.decode("utf-8"))

    # NOTE: every column is extracted independently, so when one pattern
    # misses an entry the values below it in that column shift up a row.
    for col, values in enumerate(columns):
        _write_column(ws, num2, col, values)

    # Advance past the longest column so the next page never lands on rows
    # that already hold data from this page.
    num2 += max(len(values) for values in columns)

w.save('output.xls')
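# Small web-scraper script (scrapes internship listings from shixiseng.com).
# Source page footer: latest recommended article published 2021-11-22 22:09:15.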