The code uses try/except blocks to catch exceptions along the way.
A proxy IP is picked at random and the script sleeps 15 seconds between pages, mimicking a human clicking through results to slip past the site's anti-scraping checks.
# coding=utf-8
from bs4 import BeautifulSoup
import requests
import time
import random
import MySQLdb


def getpage():
    pg = 1
    h1 = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                        '(KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    # proxy pool; one entry is picked at random for every page request
    o_g = ['222.33.192.238:8118', '121.61.17.36:8118',
           '113.200.214.164:9999', '222.33.192.238:8118']
    # search results: fund (kw=%E5%9F%BA%E9%87%91) jobs in Shanghai (jl=%E4%B8%8A%E6%B5%B7)
    base_url = ('http://sou.zhaopin.com/jobs/searchresult.ashx'
                '?in=180000&pd=30&jl=%E4%B8%8A%E6%B5%B7&kw=%E5%9F%BA%E9%87%91'
                '&sm=0&sf=0&el=4&isfilter=0&fl=538&isadv=1&sb=1&p=')
    while pg < 10:
        # re-pick the proxy and rebuild the URL on each pass; the original code
        # chose the proxy once with randint(0, 4) (an index out of range for a
        # four-item list) and never updated the page number in the URL
        pro = {'http': random.choice(o_g)}
        url = base_url + str(pg)
        try:
            resp = requests.get(url, timeout=20, headers=h1, proxies=pro)
            resp.encoding = 'utf-8'
            print('page %d fetched' % pg)
            parse(resp.text)
        except:  # TODO: pin down which exception gets raised here
            # manual checkpoint: any answer other than 'yes' just prints a
            # warning; the loop moves on to the next page either way
            xx = raw_input('again?')
            if xx == 'yes':
                pass
            else:
                print('ERROR INPUT !')
        print('waiting 15 seconds before turning the page')
        time.sleep(15)
        pg = pg + 1


def parse(html):  # renamed from re() so it no longer shadows the stdlib re module
    try:
        l1, l2, l3 = [], [], []
        soup = BeautifulSoup(html, 'lxml')
        # job title links are the bold <a> tags in the result list
        for item in soup.find_all('a', style='font-weight: bold'):
            l1.append(item.get_text())
            l2.append(item.attrs['href'])
        # the second detail row of each listing; get_text() stores plain text
        # instead of the raw Tag object the original pushed into the database
        for item2 in soup.find_all('li', class_='newlist_deatil_two'):
            l3.append(item2.get_text(strip=True))
        print(l1)
        print(l2)
        print(l3)
        # one connection for the whole batch rather than one per row
        conn = MySQLdb.connect(host='localhost', port=3306, user='root',
                               passwd='******', db='zlzp', charset='utf8')
        cur = conn.cursor()
        # iterate over what was actually scraped instead of a hard-coded 59,
        # and let MySQLdb escape the values (parameterized query) so quotes
        # in a job title cannot break or inject into the statement
        for j in range(min(len(l1), len(l2), len(l3))):
            cur.execute('INSERT INTO zlzp VALUES (NULL, %s, %s, %s)',
                        (l1[j], l2[j], l3[j]))
        cur.close()
        conn.commit()
        conn.close()
        print('page saved')
    except Exception as e:
        # the original re-ran itself on the same html here, which recurses
        # forever if parsing keeps failing; report the error and bail instead
        print('parse failed, skipping page: %s' % e)


getpage()
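
The bare except above carries a TODO about pinning down the exception. requests raises a documented hierarchy, so the handler can be narrowed to the failures that actually occur here. A minimal sketch, assuming a hypothetical fetch() helper that is not part of the script above:

# Sketch only: fetch() is a hypothetical helper, not part of the script above.
import requests

def fetch(url, headers, proxies):
    try:
        resp = requests.get(url, timeout=20, headers=headers, proxies=proxies)
        resp.raise_for_status()  # turn 4xx/5xx replies into HTTPError
        resp.encoding = 'utf-8'
        return resp.text
    except requests.exceptions.ProxyError:
        print('proxy refused the connection; rotate to another one')
    except requests.exceptions.Timeout:
        print('no response within 20 seconds')
    except requests.exceptions.RequestException as e:  # requests' catch-all base
        print('other network failure: %s' % e)
    return None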
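
On the sleep side, a fixed 15-second pause is itself an easy pattern for anti-bot checks to spot. One possible refinement (an assumption, not something the original does) is to jitter the delay:

import random
import time

# sleep a random 10-20 seconds instead of exactly 15 every page,
# so the request intervals stop forming a perfectly regular rhythm
time.sleep(random.uniform(10, 20))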
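
The INSERT expects a zlzp table whose first column auto-increments (hence the leading NULL), followed by three text columns. The real schema never appears in the post, so the column names below are assumptions that merely fit those four values:

# Hypothetical schema: column names are guesses matching the INSERT's shape.
import MySQLdb

conn = MySQLdb.connect(host='localhost', port=3306, user='root',
                       passwd='******', db='zlzp', charset='utf8')
cur = conn.cursor()
cur.execute("""
    CREATE TABLE IF NOT EXISTS zlzp (
        id     INT AUTO_INCREMENT PRIMARY KEY,  -- filled by the leading NULL
        title  VARCHAR(255),                    -- job title text (l1)
        link   VARCHAR(512),                    -- posting URL (l2)
        detail TEXT                             -- newlist_deatil_two text (l3)
    ) DEFAULT CHARSET = utf8
""")
conn.commit()
conn.close()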