Notes on a few web-scraping examples
1. Scraping the job postings returned by a keyword search on 51job
import os
import time
import Queue
import urllib2
import datetime
import threading
from bs4 import BeautifulSoup
from DBCUtils import DBOperation
class Job:
    def setJobName(self, jobName):
        self._jobName = jobName
    def getJobName(self):
        return self._jobName
    def setJobUrl(self, jobUrl):
        self._jobUrl = jobUrl
    def getJobUrl(self):
        return self._jobUrl
    def setJobDesc(self, jobDesc):
        self._jobDesc = jobDesc
    def getJobDesc(self):
        return self._jobDesc
    def setCompanyName(self, companyName):
        self._companyName = companyName
    def getCompanyName(self):
        return self._companyName
    def setCompanyUrl(self, companyUrl):
        self._companyUrl = companyUrl
    def getCompanyUrl(self):
        return self._companyUrl
    def setCompanyDesc(self, companyDesc):
        self._companyDesc = companyDesc
    def getCompanyDesc(self):
        return self._companyDesc
    def setSalary(self, salary):
        self._salary = salary
    def getSalary(self):
        return self._salary
    def setWelfare(self, welfare):
        self._welfare = welfare
    def getWelfare(self):
        return self._welfare
    def setWorkPlace(self, workPlace):
        self._workPlace = workPlace
    def getWorkPlace(self):
        return self._workPlace
    def setContact(self, contact):
        self._contact = contact
    def getContact(self):
        return self._contact
    def setIssueDate(self, issueDate):
        self._issueDate = issueDate
    def getIssueDate(self):
        return self._issueDate
    def setUpdateDate(self, updateDate):
        self._updateDate = updateDate
    def getUpdateDate(self):
        return self._updateDate
    def setState(self, state):
        self._state = state
    def getState(self):
        return self._state

    def insert(self):
        sql = 'insert into 51job_info(job_name, job_url, job_desc, \
               company_name, company_url, company_desc, salary, \
               work_place, contact, issue_date, update_date, state) \
               values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
        DBOperation.execute(sql, [self._jobName, self._jobUrl, self._jobDesc,
            self._companyName, self._companyUrl, self._companyDesc, self._salary,
            self._workPlace, self._contact, self._issueDate, self._updateDate, self._state])

    def insertBasic(self):
        sql = 'insert into 51job_info(job_name, job_url, company_name, \
               company_url, salary, work_place, update_date, state) \
               values(%s, %s, %s, %s, %s, %s, %s, %s)'
        DBOperation.execute(sql, [self._jobName, self._jobUrl,
            self._companyName, self._companyUrl, self._salary,
            self._workPlace, self._updateDate, self._state])

    def updateDesc(self):
        sql = 'update 51job_info set welfare = %s, job_desc = %s, \
               company_desc = %s, contact = %s, update_date = %s, \
               state = 2 where job_url = %s'
        DBOperation.execute(sql, [self._welfare, self._jobDesc,
            self._companyDesc, self._contact, self._updateDate, self._jobUrl])
class JobCrawlMain:
    def run(self):
        jobUrlCrawlController = JobUrlCrawlController()
        # Search-result URL template; $keyword$ and $curr_page$ are filled in below.
        url = 'http://search.51job.com/jobsearch/search_result.php?fromJs=1&jobarea=090200%2C00&district=000000&funtype=0000&industrytype=00&issuedate=9&providesalary=99&keyword=$keyword$&keywordtype=0&curr_page=$curr_page$&lang=c&stype=2&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&list_type=0&fromType=14&dibiaoid=0&confirmdate=9'
        keyword = 'java%E5%B7%A5%E7%A8%8B%E5%B8%88'  # 'java工程师', URL-encoded
        keyword_url = url.replace('$keyword$', keyword)
        for i in range(10):
            search_url = keyword_url.replace('$curr_page$', str(i + 1))
            req = urllib2.Request(search_url)
            req.add_header('Content-Type', 'application/x-www-form-urlencoded')
            req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.146 Safari/537.36')
            # The request must carry this Cookie header, otherwise the scraped page data is wrong.
            req.add_header('Cookie', 'guid=xxxxxxxxxxxxxxx; 51job=cenglish%3D0; guide=1; search=jobarea%7E%60090200%7C%21ord_field%7E%600%7C%21list_type%7E%600%7C%21recentSearch0%7E%602%A1%FB%A1%FA090200%2C00%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FAjava%B9%A4%B3%CC%CA%A6%A1%FB%A1%FA0%A1%FB%A1%FA%A1%FB%A1%FA-1%A1%FB%A1%FA1458700297%A1%FB%A1%FA0%A1%FB%A1%FA%7C%21recentSearch1%7E%602%A1%FB%A1%FA090200%2C00%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA%B9%A4%B3%CC%CA%A6%A1%FB%A1%FA0%A1%FB%A1%FA%A1%FB%A1%FA-1%A1%FB%A1%FA1458704261%A1%FB%A1%FA0%A1%FB%A1%FA%7C%21')
            resp = urllib2.urlopen(req)
            # 51job serves its pages in gb2312.
            html = BeautifulSoup(resp, from_encoding='gb2312')
            divs = html.find_all('div', class_='el')
            for div in divs:
                pas = div.select('p a')
                if pas:
                    job = Job()
                    for pa in pas:
                        if pa.has_attr('title'):
                            jobUrl = pa['href'].encode('utf-8')
                            print jobUrl
                            job.setJobUrl(jobUrl)
                            jobName = pa['title'].encode('utf-8')
                            print jobName
                            job.setJobName(jobName)
                    sas = div.select('.t2 a')
                    for sa in sas:
                        companyUrl = sa['href'].encode('utf-8')
                        job.setCompanyUrl(companyUrl)
                        companyName = sa['title'].encode('utf-8')
                        print companyName
                        job.setCompanyName(companyName)
                    s3 = div.find(class_='t3')
                    workPlace = s3.get_text().encode('utf-8')
                    job.setWorkPlace(workPlace)
                    s4 = div.find(class_='t4')
                    salary = s4.get_text().encode('utf-8')
                    job.setSalary(salary)
                    s5 = div.find(class_='t5')
                    issueDate = s5.get_text().encode('utf-8')
                    print issueDate
                    #job.setIssueDate(issueDate)  # issue_date is not stored by insertBasic
                    job.setUpdateDate(datetime.datetime.now())
                    job.setState(1)
                    job.insertBasic()
                    # Hand the detail-page URL to the worker threads.
                    jobUrlCrawlController.addTaskQueue(jobUrl)
            time.sleep(30)  # pause between result pages
        jobUrlCrawlController.waitForComplete()
class JobUrlCrawl:
    def __init__(self, jobUrl):
        self._jobUrl = jobUrl

    def run(self):
        try:
            job = Job()
            job.setJobUrl(self._jobUrl)
            req = urllib2.Request(self._jobUrl)
            resp = urllib2.urlopen(req)
            html = BeautifulSoup(resp, from_encoding='gb2312')
            welfareP = html.find('p', {'class': 't2'})
            welfare = welfareP.get_text().encode('utf-8').strip()
            job.setWelfare(welfare)
            jobDescDiv = html.find('div', {'class': 'bmsg job_msg inbox'})
            jobDesc = jobDescDiv.get_text().encode('utf-8').strip()
            job.setJobDesc(jobDesc)
            contactDiv = html.find('div', {'class': 'bmsg inbox'})
            contact = contactDiv.get_text().encode('utf-8').strip()
            job.setContact(contact)
            companyDescDiv = html.find('div', {'class': 'tmsg inbox'})
            companyDesc = companyDescDiv.get_text().encode('utf-8').strip()
            job.setCompanyDesc(companyDesc)
            job.setUpdateDate(datetime.datetime.now())
            job.updateDesc()
        except Exception as e:
            print str(e)
class JobUrlCrawlThread(threading.Thread):
    def __init__(self, taskQueue):
        threading.Thread.__init__(self)
        self._taskQueue = taskQueue
        self.setDaemon(True)
        self.start()  # daemon workers start polling the shared queue immediately

    def run(self):
        print 'current thread name %s' % threading.currentThread().name
        accumulate = 0
        while True:
            try:
                taskQueueSize = self._taskQueue.qsize()
                print 'task queue size %s' % taskQueueSize
                # Poll while the queue is empty; give up once it has stayed
                # empty for more than five consecutive checks.
                while taskQueueSize == 0:
                    time.sleep(5)
                    taskQueueSize = self._taskQueue.qsize()
                    print 'task queue size %s' % taskQueueSize
                    accumulate = accumulate + 1
                    if accumulate > 5:
                        break
                jobUrl = self._taskQueue.get(False)
                accumulate = 0  # reset the idle counter once work arrives
                jobUrlCrawl = JobUrlCrawl(jobUrl)
                jobUrlCrawl.run()
                self._taskQueue.task_done()
            except Exception as e:
                # Queue.Empty raised after the idle timeout ends the thread.
                print str(e)
                break
class JobUrlCrawlController:
    def __init__(self, threadNum=5):
        self._taskQueue = Queue.Queue()
        self._threadPool = []
        self.__initThreadPool(threadNum)

    def __initThreadPool(self, threadNum):
        for i in range(threadNum):
            jobUrlCrawlThread = JobUrlCrawlThread(self._taskQueue)
            self._threadPool.append(jobUrlCrawlThread)

    def addTaskQueue(self, jobUrl):
        self._taskQueue.put(jobUrl, False)

    def waitForComplete(self):
        for threadInstance in self._threadPool:
            if threadInstance.isAlive():
                threadInstance.join()
if __name__ == '__main__':
    print 'MySQL Service Start'
    os.system('net start MySQL57')
    print 'main method run start...'
    jobCrawlMain = JobCrawlMain()
    jobCrawlMain.run()
    print 'main method run end...'
    print 'MySQL Service Stop'
    os.system('net stop MySQL57')
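The DBCUtils module imported at the top is not shown in this post. For completeness, here is a minimal sketch of what its DBOperation.execute helper could look like; the MySQLdb driver, the connection settings, and the schema name 'crawler' are all assumptions, not part of the original code.

# DBCUtils.py -- hypothetical sketch only; the real module is not shown above.
import MySQLdb

class DBOperation:
    @staticmethod
    def execute(sql, params):
        # One connection per statement for simplicity; the %s placeholders
        # in the SQL are bound by the driver, not interpolated by hand.
        conn = MySQLdb.connect(host='localhost', user='root', passwd='xxxxxx',
                               db='crawler', charset='utf8')  # assumed settings
        try:
            cursor = conn.cursor()
            cursor.execute(sql, params)
            conn.commit()
        finally:
            conn.close()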
2. Scraping the Douban movie chart
import requests
from bs4 import BeautifulSoup

url = 'https://movie.douban.com/chart'
cookie = 'ue="xxxxxxxxxxx@sina.com"; bid="xt+vaLfsYeE"; ll="118414"; __utma=30149280.273243958.1453451325.1453451325.1453451325.1; __utmz=30149280.1453451325.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1461950681%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3D7LA3_5-osZKjW5mpc2d-ZbPefoLKrg7zYj9RjCLY813hxSjO_cfU6AHYJwUbXvTw%26wd%3D%26eqid%3Dff57fd8d003894c00000000356f78542%22%5D; push_noty_num=1; push_doumail_num=0; ap=1; _pk_id.100001.8cb4=31efd7e6db62874b.1458793646.4.1461950722.1461946873.; _pk_ses.100001.8cb4=*'
# Send the raw cookie string as a header; the cookies= parameter of
# requests.get expects name/value pairs, not a pre-built header string.
resp = requests.get(url, headers={'Cookie': cookie})
# Douban serves UTF-8 and requests has already decoded resp.text,
# so no from_encoding hint is needed here.
html = BeautifulSoup(resp.text, 'html.parser')
divTags = html.find_all('div', class_='pl2')
for divTag in divTags:
    aTag = divTag.find('a')
    mainTitle = aTag.get_text().encode('utf-8')
    spanTag = aTag.find('span')
    subTitle = spanTag.get_text().encode('utf-8')
    title = mainTitle + subTitle
    print title
    pTag = divTag.find('p')
    summary = pTag.get_text().encode('utf-8')
    print summary
    score = divTag.select('.rating_nums')[0].get_text().encode('utf-8')
    print score
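Alternatively, you can let requests manage the cookies itself by splitting the raw string into name/value pairs first. A small sketch, assuming every entry in the string has the form name=value:

# Parse the raw cookie string into a dict for requests' cookies= parameter.
cookie_dict = dict(item.split('=', 1) for item in cookie.split('; '))
resp = requests.get(url, cookies=cookie_dict)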
3. Scraping jokes and mailing the shortest one
import urllib2, json, sys, smtplib
from email.mime.text import MIMEText

reload(sys)
sys.setdefaultencoding('utf-8')  # avoid Chinese encoding problems when mixing str and unicode

mailto_list = ['xxxxxxxxx@qq.com']  # mail recipients
mail_host = "smtp.sina.com"         # SMTP server
mail_user = "xxxxxxxxxxx"           # user name
mail_pass = "xxxxxxxxxxx"           # password
mail_postfix = 'sina.com'

def send_mail(to_list, part1, sub, content):
    # to_list: recipients; sub: subject; content: mail body.
    # part1 is accepted but unused; the caller passes slices of the joke text.
    me = "hello" + "<" + mail_user + "@" + mail_postfix + ">"
    msg = MIMEText(content, _subtype='plain', _charset='utf-8')  # plain-text mail, UTF-8
    msg['Subject'] = sub
    msg['From'] = me
    msg['To'] = ";".join(to_list)
    try:
        s = smtplib.SMTP()
        s.connect(mail_host)           # connect to the SMTP server
        s.login(mail_user, mail_pass)  # log in
        s.sendmail(me, to_list, msg.as_string())
        s.close()
        return True
    except Exception as e:
        print str(e)
        return False
if __name__ == '__main__':
    appkey = "e2376cfbe3b27dff923ed61698839a67"
    url = 'http://apis.baidu.com/showapi_open_bus/showapi_joke/joke_text?page=1'
    req = urllib2.Request(url)
    req.add_header("apikey", appkey)
    resp = urllib2.urlopen(req)
    content = resp.read()
    if content:
        json_result = json.loads(content)
        content_list = json_result['showapi_res_body']['contentlist']
        # Pick the shortest joke on the page.
        minlen = 10000
        for item in content_list:
            print item['text']
            if len(item['text']) < minlen:
                first_title = item['title']
                first_text = item['text']
                minlen = len(item['text'])
        print 'title:' + first_title
        print 'content:' + first_text
        # Slice the joke into three parts; part2 becomes the mail subject
        # and part3 the mail body (see send_mail's signature).
        length = len(first_text)
        part1 = first_text[0:10]
        part2 = first_text[10:22]
        part3 = first_text[22:length]
        print part1, "+", part2, "+", part3
        if send_mail(mailto_list, part1, part2, part3):
            print "send msg succeed"
        else:
            print "send msg failed"
    else:
        print "get joke error"