Python爬虫:抓取智联岗位信息保存csv

 
from urllib.parse import urlencode
import requests
import re
import csv
from tqdm import tqdm
from lxml import etree

# Fetch the raw HTML of one search-result page.
def get_one_page(city, keyword, region, page):
    """Fetch one result page of job listings from sou.zhaopin.com.

    Args:
        city: city name for the 'jl' query parameter (e.g. '北京').
        keyword: search keyword for 'kw' (e.g. 'python工程师').
        region: industry/region code for 're' (e.g. 2005).
        page: 1-based result-page number for 'p'.

    Returns:
        The response body as decoded text.
    """
    # Bug fix: the original ignored all four arguments and always fetched
    # page 1 for Beijing/python工程师 — wire them into the query string.
    params = {
        'jl': city,
        'kw': keyword,
        'sm': 0,
        'isfilter': 1,
        'p': page,
        're': region,
    }
    url = 'http://sou.zhaopin.com/jobs/searchresult.ashx?' + urlencode(params)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
        'Referer': 'http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E5%8C%97%E4%BA%AC&kw=python%E5%B7%A5%E7%A8%8B%E5%B8%88&sm=0&p=1',
        'Host': 'sou.zhaopin.com'
    }
    # Timeout so a stalled connection cannot hang the crawler forever.
    response = requests.get(url, headers=headers, timeout=10)
    return response.text

# Extract job title, company name, company detail-page URL and monthly
# salary from one result page, then append the rows to the CSV file.
def parse_one_page(html):
    """Parse one search-result page and append the listings to the CSV.

    Relies on the module-level globals ``filename`` and ``headers``
    defined in the ``__main__`` block.

    Args:
        html: raw HTML text of a sou.zhaopin.com result page.

    Returns:
        The list of row dicts that were written to the CSV.
    """
    rows = []
    tree = etree.HTML(html)
    base = '//*[@id="newlist_list_content_table"]/table[{}]'
    # Tables 2..10 hold the listings (table 1 appears to be the column
    # header row — TODO confirm against a live page).
    for i in range(2, 11):
        prefix = base.format(i)
        # Bug fix: the original indexed [0] unconditionally and crashed
        # with IndexError whenever a page had fewer than 9 listings or
        # the markup changed; extract each field defensively instead.
        job = tree.xpath(prefix + '/tr[1]/td[1]/div/a/text()')
        company = tree.xpath(prefix + '/tr[1]/td[3]/a[1]/text()')
        website = tree.xpath(prefix + '/tr[1]/td[3]/a[1]/@href')
        salary = tree.xpath(prefix + '/tr[1]/td[4]/text()')
        if not job:
            continue  # no listing in this table slot — skip it
        rows.append({
            'job': job[0],
            'company': company[0] if company else '',
            'website': website[0] if website else '',
            'salary': salary[0] if salary else '',
        })
    write_csv_rows(filename, headers, rows)
    return rows

def write_csv_headers(path, headers):
    """Append a header row (the given field names) to the CSV at *path*."""
    # utf_8_sig writes a BOM so Excel auto-detects the UTF-8 encoding.
    with open(path, 'a', encoding='utf_8_sig', newline='') as out:
        csv.DictWriter(out, headers).writeheader()

def write_csv_rows(path, headers, rows):
    """Append *rows* (dicts keyed by *headers*) to the CSV at *path*."""
    # Append mode so rows from successive pages accumulate in one file;
    # utf_8_sig keeps the encoding consistent with write_csv_headers.
    with open(path, 'a', encoding='utf_8_sig', newline='') as out:
        writer = csv.DictWriter(out, headers)
        writer.writerows(rows)

if __name__ == '__main__':
    # NOTE: filename and headers are module-level globals that
    # parse_one_page() reads directly — do not move them into a function.
    filename = 'data.csv'
    # Header order need not match dict insertion order: csv.DictWriter
    # maps each value by key, but every key must appear here.
    headers = ['job', 'website', 'company', 'salary']
    # Write the header row once, before any data rows are appended.
    write_csv_headers(filename, headers)
    parse_one_page(get_one_page('北京', 'python工程师', 2005, 1))
  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值