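# Scrapes job postings from 51job (前程无忧) using the lxml and selenium modules. Only a single page of results is scraped for now; more will be added in a later update.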
import re
import time
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
class JobSpider(object):
    """Selenium-driven scraper for one 51job.com search-results page.

    The spider loads the search-results page, extracts the summary columns
    from the listing, then visits every detail-page link found on that page
    and extracts the job-requirements section from each detail page.
    """

    # Search-results URL. The keyword segment is "python" (it was misspelled
    # "pyhton"), and "&degreefrom=99" is spelled out in full: the "&deg"
    # prefix had been decoded into a degree sign, which merged two query
    # parameters into one.
    START_URL = (
        'https://search.51job.com/list/040000%252C020000,000000,0000,00,9,99,'
        'python,2,1.html'
        '?lang=c&stype=&postchannel=0000&workyear=99&cotype=99'
        '&degreefrom=99&jobterm=99&companysize=99&providesalary=99'
        '&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType='
        '&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
    )

    # Absolute XPath to the detail-page links on the search-results page.
    _DETAIL_LINKS_XPATH = '/html/body/div[2]/div[4]/div/p/span/a/@href'

    # Summary columns of a listing row. Per the original author's notes the
    # t3/t4/t5 spans hold work location, salary and posting date; the job
    # title and company name are not captured here.
    _LISTING_RE = re.compile(
        r'<span class="t3">(.*?)</span>.*?'
        r'<span class="t4">(.*?)</span>.*?'
        r'<span class="t5">(.*?)</span>',
        re.DOTALL,
    )

    # Job-requirements section of a detail page (raw inner markup).
    _REQUIREMENTS_RE = re.compile(
        r'<div class="bmsg job_msg inbox">(.*?)<div class="mt10">',
        re.DOTALL,
    )

    def __init__(self, driver=None, delay=2):
        """Set up the start URL and the browser session.

        Args:
            driver: Optional WebDriver-like object exposing ``get``,
                ``page_source`` and ``quit``. When omitted, a new
                ``webdriver.Chrome()`` session is started.
            delay: Seconds to pause after each page visit.
        """
        self.url = self.START_URL
        self.delay = delay
        # Created per instance rather than in the class body, so importing
        # the module does not launch a browser and one instance's run()
        # quitting its driver cannot break another instance.
        self.driver = webdriver.Chrome() if driver is None else driver

    def run(self):
        """Entry point: scrape the start page, then close the browser.

        Returns:
            The ``(listings, details)`` tuple produced by ``select_page``.
        """
        try:
            self.driver.get(self.url)
            results = self.select_page(self.driver.page_source)
            time.sleep(self.delay)
            return results
        finally:
            # Quit even when a page load or parse step raises, so the
            # browser process is not left running.
            self.driver.quit()

    def select_page(self, source):
        """Parse the search-results page and visit every detail link on it.

        Args:
            source: HTML text of the search-results page.

        Returns:
            A ``(listings, details)`` tuple: ``listings`` is the result of
            ``parse_home`` and ``details`` holds one ``requests_page``
            result per detail link, in page order.
        """
        page_html = etree.HTML(source)
        listings = self.parse_home(page_html)
        detail_urls = page_html.xpath(self._DETAIL_LINKS_XPATH)
        details = []
        for detail_url in detail_urls:
            details.append(self.requests_page(detail_url))
            # Pause between detail-page requests.
            time.sleep(self.delay)
        return listings, details

    def parse_home(self, page_html):
        """Extract the summary columns from the parsed search-results page.

        Args:
            page_html: lxml element tree of the search-results page.

        Returns:
            A list of 3-tuples of strings, one per listing row matched by
            the t3/t4/t5 pattern.
        """
        # Serialise the tree back to text so the regex can be applied.
        html_str = etree.tostring(page_html, encoding='utf-8').decode('utf-8')
        return self._extract_listing_fields(html_str)

    def requests_page(self, html):
        """Load one detail page in the browser and parse it.

        Args:
            html: URL of the detail page.

        Returns:
            The result of ``parse_page`` for that page.
        """
        self.driver.get(html)
        return self.parse_page(self.driver.page_source)

    def parse_page(self, source):
        """Extract the job-requirements section from a detail page.

        Args:
            source: HTML text of the detail page.

        Returns:
            A list of raw HTML fragments, one per requirements block found.
        """
        html = etree.HTML(source)
        # Re-serialise the parsed tree to a string so the regex can be used;
        # the pattern therefore runs against lxml's output rather than the
        # raw page source.
        htmls = etree.tostring(html, encoding='utf-8').decode('utf-8')
        # TODO: job title and company name are available on the detail page:
        #   html.xpath('/html/body/div[3]/div[2]/div[2]/div/div[1]/h1/@title')
        #   html.xpath('/html/body/div[3]/div[2]/div[2]/div/div[1]/p[1]/a[1]/@title')
        return self._extract_requirements(htmls)

    @classmethod
    def _extract_listing_fields(cls, html_str):
        """Return every (t3, t4, t5) span-text triple found in *html_str*."""
        return cls._LISTING_RE.findall(html_str)

    @classmethod
    def _extract_requirements(cls, html_str):
        """Return the inner markup of every requirements block in *html_str*."""
        return cls._REQUIREMENTS_RE.findall(html_str)
# Script entry point: build a spider and run it once when executed directly.
if __name__ == '__main__':
    JobSpider().run()