Crawling 51job (前程无忧) job listings with Python urllib2

# coding:utf-8
__author__ = "carry"

import urllib2, re, sys
# reload(sys)
# sys.setdefaultencoding('utf-8')


# Fetch the raw HTML of one search-results page
def get_content(page):
    headers = {
        # 'Host': 'search.51job.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0',
        # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        # 'Connection': 'keep-alive'
    }
    url = 'http://search.51job.com/list/000000,000000,0000,00,9,99,测试,2,' + str(page) + '.html'
    req = urllib2.Request(url, headers=headers)  # for a form submission you would need urllib.urlencode
    r = urllib2.urlopen(req)
    # print r.geturl()
    # print r.getcode()
    response = r.read()  # raw page source (GBK-encoded bytes)
    return response


# Extract job title, company, location, salary and posting date from the HTML
def get(html):
    reg = re.compile('class="t1 ">.*? <a target="_blank" title="(.*?)".*? <span class="t2">'
                     '<a target="_blank" title="(.*?)".*?<span class="t3">(.*?)</span>'
                     '.*?<span class="t4">(.*?)</span>.*? <span class="t5">(.*?)</span>',
                     re.S)  # re.S lets .*? match across newlines; a raw string r'' would avoid escaping issues
    items = re.findall(reg, html)  # equivalent to reg.findall(html)
    return items  # a list of tuples, one tuple of captured groups per job posting


kong = []
# Crawl several pages and collect the rows
for j in range(1, 11):
    print "Crawling page %s ..." % j
    html1 = get_content(j)  # fetch this page's source
    for i in get(html1):
        tiqu = i[0] + '\t' + i[1] + '\t' + i[2] + '\t' + i[3] + '\t' + i[4]  # tab-separated fields
        # print tiqu.decode('gbk')
        kong.append(tiqu)

# Write all rows to a text file, prefixed with a zero-padded index
with open(r'C:\Users\bin.sun\Desktop\51.txt', 'a') as f:
    for index, x in enumerate(kong, start=1):
        index1 = '%03d' % index
        shuju = index1 + ' ' + x + '\n'
        f.write(shuju)

"""
for x in get(get_content(2))[1]:
    print x.decode('gbk')  # printing the raw list shows escaped bytes; decode('gbk') gives readable Chinese
print len(get(get_content(2)))
"""
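To sanity-check the extraction step without hitting the site, the same re.S + findall technique can be tried on a hard-coded fragment. The sketch below sticks to the post's Python 2 style; the HTML sample and the simplified pattern are made up to mimic the t1/t3/t4 class layout rather than the exact 51job markup, and only show how each (.*?) group becomes one field of the result tuple.

# coding:utf-8
import re

# A made-up fragment that imitates the span layout of the results table
sample = '''
<p class="t1 "><a target="_blank" title="Test Engineer">Test Engineer</a></p>
<span class="t3">Shanghai</span>
<span class="t4">8-10k/month</span>
'''

# Same idea as get(): re.S lets .*? run across line breaks,
# and each (.*?) group becomes one field of the matched tuple
pattern = re.compile(r'class="t1 "><a target="_blank" title="(.*?)".*?'
                     r'<span class="t3">(.*?)</span>.*?<span class="t4">(.*?)</span>', re.S)

for title, city, salary in re.findall(pattern, sample):
    print title + '\t' + city + '\t' + salary

Each match prints as one tab-separated line, which is the same shape as the rows appended to kong in the full script.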
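The whole script is Python 2: urllib2 and the print statement do not exist on Python 3. For completeness, a minimal Python 3 sketch of get_content() is shown below; the URL layout and the GBK page encoding are taken from the post and assumed to still apply, and the search keyword is percent-encoded with urllib.parse.quote because Python 3's urlopen rejects raw non-ASCII bytes in the URL.

# Python 3 sketch of get_content(); URL layout and GBK encoding are assumptions from the post
import urllib.request
from urllib.parse import quote

def get_content(page):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0'}
    url = ('http://search.51job.com/list/000000,000000,0000,00,9,99,'
           + quote('测试') + ',2,' + str(page) + '.html')  # percent-encode the keyword 测试
    req = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(req) as r:
        return r.read().decode('gbk', errors='ignore')  # pages were served as GBK

The rest of the script ports almost unchanged: print becomes print(), and the regex and file-writing parts work the same once the HTML is already a decoded str.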