Crawling 51job (前程无忧) job listings with Python urllib2

# coding:utf-8
__author__ = "carry"

import urllib2, re, sys
# reload(sys)
# sys.setdefaultencoding('utf-8')


# Fetch the raw HTML of one search-results page
def get_content(page):
    headers = {
        # 'Host': 'search.51job.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0',
        # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        # 'Connection': 'keep-alive'
    }
    url = 'http://search.51job.com/list/000000,000000,0000,00,9,99,测试,2,' + str(page) + '.html'
    req = urllib2.Request(url, headers=headers)  # for a form submission you would need urllib.urlencode
    r = urllib2.urlopen(req)
    # print r.geturl()
    # print r.getcode()
    response = r.read()  # raw page source (GBK-encoded bytes)
    return response


# Extract job title, company, location, salary and posting date from the HTML
def get(html):
    reg = re.compile('class="t1 ">.*? <a target="_blank" title="(.*?)".*? <span class="t2">'
                     '<a target="_blank" title="(.*?)".*?<span class="t3">(.*?)</span>'
                     '.*?<span class="t4">(.*?)</span>.*? <span class="t5">(.*?)</span>',
                     re.S)  # re.S lets .*? match across newlines; a raw string r'' would avoid escaping issues
    items = re.findall(reg, html)  # equivalent to reg.findall(html)
    return items  # a list of tuples, one tuple of captured groups per job posting


kong = []
# Crawl several pages and collect the rows
for j in range(1, 11):
    print "Crawling page %s ..." % j
    html1 = get_content(j)  # fetch this page's source
    for i in get(html1):
        tiqu = i[0] + '\t' + i[1] + '\t' + i[2] + '\t' + i[3] + '\t' + i[4]  # tab-separated fields
        # print tiqu.decode('gbk')
        kong.append(tiqu)

# Write all rows to a text file, prefixed with a zero-padded index
with open(r'C:\Users\bin.sun\Desktop\51.txt', 'a') as f:
    for index, x in enumerate(kong, start=1):
        index1 = '%03d' % index
        shuju = index1 + ' ' + x + '\n'
        f.write(shuju)

"""
for x in get(get_content(2))[1]:
    print x.decode('gbk')  # printing the raw list shows escaped bytes; decode('gbk') gives readable Chinese
print len(get(get_content(2)))
"""
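To sanity-check the extraction step without hitting the site, the same re.S + findall technique can be tried on a hard-coded fragment. The sketch below sticks to the post's Python 2 style; the HTML sample and the simplified pattern are made up to mimic the t1/t3/t4 class layout rather than the exact 51job markup, and only show how each (.*?) group becomes one field of the result tuple.

# coding:utf-8
import re

# A made-up fragment that imitates the span layout of the results table
sample = '''
<p class="t1 "><a target="_blank" title="Test Engineer">Test Engineer</a></p>
<span class="t3">Shanghai</span>
<span class="t4">8-10k/month</span>
'''

# Same idea as get(): re.S lets .*? run across line breaks,
# and each (.*?) group becomes one field of the matched tuple
pattern = re.compile(r'class="t1 "><a target="_blank" title="(.*?)".*?'
                     r'<span class="t3">(.*?)</span>.*?<span class="t4">(.*?)</span>', re.S)

for title, city, salary in re.findall(pattern, sample):
    print title + '\t' + city + '\t' + salary

Each match prints as one tab-separated line, which is the same shape as the rows appended to kong in the full script.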
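The whole script is Python 2: urllib2 and the print statement do not exist on Python 3. For completeness, a minimal Python 3 sketch of get_content() is shown below; the URL layout and the GBK page encoding are taken from the post and assumed to still apply, and the search keyword is percent-encoded with urllib.parse.quote because Python 3's urlopen rejects raw non-ASCII bytes in the URL.

# Python 3 sketch of get_content(); URL layout and GBK encoding are assumptions from the post
import urllib.request
from urllib.parse import quote

def get_content(page):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0'}
    url = ('http://search.51job.com/list/000000,000000,0000,00,9,99,'
           + quote('测试') + ',2,' + str(page) + '.html')  # percent-encode the keyword 测试
    req = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(req) as r:
        return r.read().decode('gbk', errors='ignore')  # pages were served as GBK

The rest of the script ports almost unchanged: print becomes print(), and the regex and file-writing parts work the same once the HTML is already a decoded str.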