Python爬虫——4.5urllib2和xpath爬取前程无忧网招聘信息

最新推荐文章于 2024-04-01 15:59:06 发布

一杯海风

最新推荐文章于 2024-04-01 15:59:06 发布

阅读量1.1k

点赞数

分类专栏：基础篇

本文链接：https://blog.csdn.net/liyahui_3163/article/details/79060903

版权

基础篇专栏收录该内容

47 篇文章 1 订阅

订阅专栏

# coding:utf-8
'''
使用urllib2模块进行数据采集，使用XPath进行数据筛选,使用随机免费代理
'''
# 引入需要的模块
import urllib2
from lxml import etree
import random



# Free proxy list; each entry has the mapping shape urllib2.ProxyHandler accepts.
proxy_list = [{"http":"116.8.83.3:8118"}, {"http":"116.8.83.3:8118"},
              {"http":"113.89.59.161:8118"}, {"http":"113.67.183.196:8118"},
              {"http":"180.155.135.224:31425"}, {"http":"123.161.153.238:22593"}]
# Pick one proxy at random.
# NOTE: the requests below are sent directly, without the proxy. To route them
# through it, open the request with
# urllib2.build_opener(urllib2.ProxyHandler(proxy_ip)).open(request) instead
# of urllib2.urlopen(request).
proxy_ip = random.choice(proxy_list)

# The page number is inserted between these two URL pieces. Plain
# concatenation is used because the URL itself contains '%' escapes.
URL_PREFIX = 'http://search.51job.com/list/170200,000000,0000,00,9,99,%2B,2,+'
URL_SUFFIX = '.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='

# Request headers sent with every page request.
my_header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}

# XPath expressions for the four columns written to the output file.
NAME_XPATH = "//div[@id='resultList']/div[@class='el']/p[@class='t1 ']/span"
COMPANY_XPATH = "//div[@class='el']/span[@class='t2']"
MONTH_XPATH = "/html/body/div[@class='dw_wp']/div[@id='resultList']/div[@class='el']/span[@class='t3']"
TIME_XPATH = "/html/body/div[@class='dw_wp']/div[@id='resultList']/div[@class='el']/span[@class='t5']"


def _fetch_page(num):
    """Download result page *num* and return it parsed as an lxml HTML tree."""
    request = urllib2.Request(URL_PREFIX + str(num) + URL_SUFFIX, headers=my_header)
    response = urllib2.urlopen(request)
    try:
        raw = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()
    # Setting an attribute on the response does not influence decoding, so
    # the GBK encoding is handed to the parser explicitly.
    return etree.HTML(raw, parser=etree.HTMLParser(encoding='gbk'))


def _column_text(html, xpath):
    """Return the stripped text content of every node matched by *xpath*."""
    return [node.xpath('string(.)').strip() for node in html.xpath(xpath)]


# Ask how many pages to crawl. raw_input + int avoids Python 2's input(),
# which evaluates whatever the user types as an expression.
nums = int(raw_input("请输入要爬取的页数:"))
for num in range(1, nums + 1):
    html = _fetch_page(num)

    names_list = _column_text(html, NAME_XPATH)
    companys_list = _column_text(html, COMPANY_XPATH)
    months_list = _column_text(html, MONTH_XPATH)
    times_list = _column_text(html, TIME_XPATH)

    # 'with' closes the file even when writing a record raises.
    with open('qcwy.txt', "a") as f:
        f.write("第%s页内容如下:"%num+"\r\n")
        f.write("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@2222"+"\r\n")
        # zip stops at the shortest column, so a page where the four queries
        # match different numbers of nodes cannot cause an IndexError.
        for record in zip(names_list, companys_list, months_list, times_list):
            info = u"|".join(record) + u"\r\n"
            print(info)
            f.write(info.encode('utf-8'))
        f.write("第%s页爬取完毕"%num+"\r\n")
        f.write("#############################################################################################"+"\r\n")

print ("爬取完毕")