from urllib.parse import urlencode
import requests
import re
import csv
from tqdm import tqdm
from lxml import etree


def get_one_page(city, keyword, region, page):
    """Fetch one page of Zhaopin search results and return the HTML text.

    :param city: city name for the ``jl`` query field (e.g. ``'北京'``)
    :param keyword: job keyword for the ``kw`` query field (e.g. ``'python工程师'``)
    :param region: site-specific region code for the ``re`` query field
    :param page: 1-based result page number for the ``p`` query field
    :return: the response body as text
    :raises requests.HTTPError: if the server answers with an error status
    """
    # BUG FIX: the original hard-coded every query value ('北京',
    # 'python工程师', p=1, re=2005) and silently ignored the function's
    # own parameters. The arguments are now actually used.
    params = {
        'jl': city,
        'kw': keyword,
        'sm': 0,
        'isfilter': 1,
        'p': page,
        're': region,
    }
    url = 'http://sou.zhaopin.com/jobs/searchresult.ashx?' + urlencode(params)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
        'Referer': 'http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E5%8C%97%E4%BA%AC&kw=python%E5%B7%A5%E7%A8%8B%E5%B8%88&sm=0&p=1',
        'Host': 'sou.zhaopin.com',
    }
    response = requests.get(url, headers=headers)
    # Fail loudly on HTTP errors instead of handing an error page to the parser.
    response.raise_for_status()
    return response.text


def _first(nodes, default=''):
    """Return the first item of an XPath result list, or *default* when empty."""
    return nodes[0] if nodes else default


def parse_one_page(html):
    """Extract job listings from one result page and append them to the CSV.

    Pulls job title, company name, company detail URL and monthly salary
    from result tables 2..10 of the page.

    NOTE(review): writes via the module-level ``filename`` and ``headers``
    defined in ``__main__`` — kept for backward compatibility with the
    original script structure.

    :param html: raw HTML of a search-result page
    :return: list of row dicts with keys ``job``/``company``/``website``/``salary``
    """
    rows = []
    tree = etree.HTML(html)
    row_base = '//*[@id="newlist_list_content_table"]/table[{}]/tr[1]'
    for i in range(2, 11):
        prefix = row_base.format(i)
        # _first() guards against cells missing from the page layout, so one
        # malformed row no longer aborts the whole page with IndexError.
        rows.append({
            'job': _first(tree.xpath(prefix + '/td[1]/div/a/text()')),
            'company': _first(tree.xpath(prefix + '/td[3]/a[1]/text()')),
            'website': _first(tree.xpath(prefix + '/td[3]/a[1]/@href')),
            'salary': _first(tree.xpath(prefix + '/td[4]/text()')),
        })
    write_csv_rows(filename, headers, rows)
    return rows


def write_csv_headers(path, headers):
    """Append the CSV header row to *path* (utf-8 with BOM, Excel-friendly)."""
    with open(path, 'a', encoding='utf_8_sig', newline='') as f:
        csv.DictWriter(f, headers).writeheader()


def write_csv_rows(path, headers, rows):
    """Append the dicts in *rows* to the CSV at *path*, ordered by *headers*."""
    with open(path, 'a', encoding='utf_8_sig', newline='') as f:
        csv.DictWriter(f, headers).writerows(rows)


if __name__ == '__main__':
    filename = 'data.csv'
    # Header names must match the dict keys produced by parse_one_page.
    headers = ['job', 'website', 'company', 'salary']
    # Write the header row once, then fetch and append page 1.
    write_csv_headers(filename, headers)
    parse_one_page(get_one_page('北京', 'python工程师', 2005, 1))