The data is scraped with IDEA (IntelliJ IDEA); how to set up a Python environment inside IDEA is left for you to look up.
These are the project files. They mainly cover handling anti-scraping measures with urllib.request (random User-Agent headers and proxies), building a proxy pool, and validating the proxies with high concurrency.
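The listing is split into four files, separated below by lines of # characters: collect_proxy.py (fetches proxy candidates and defines the chekout_proxy validator), thead_pool.py (a small thread pool used for concurrent validation), spider_dbinfo.py (the Douban movie spider), and an entry script that ties them together. A fifth module, minfo_save (the mysqlHandler used to persist results), is imported but not included in this listing. The third-party packages behind the imports can be installed with:

pip install beautifulsoup4 lxml fake-useragent

First, collect_proxy.py. It pulls candidate IPs from the 66ip.cn API (and can optionally crawl the site's area pages), while chekout_proxy validates a candidate by opening a Douban page through it: the candidate is wrapped into a {'http': ip} mapping for ProxyHandler and returned only when the request comes back with HTTP 200, so the thread pool's result queue ends up holding only the working proxies.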
from urllib import request
from bs4 import BeautifulSoup
from fake_useragent import FakeUserAgent
import re
import time
from thead_pool import thread_pool


def chekout_proxy(ip):
    # Wrap the candidate into the mapping ProxyHandler expects and try to open
    # a Douban page through it; return the mapping only on HTTP 200.
    ip = {'http': ip}
    proxy = request.ProxyHandler(ip)
    opener = request.build_opener(proxy)
    ua = FakeUserAgent()
    url = 'http://movie.douban.com/'
    headinfo = {'User-Agent': ua.random}
    reqhd = request.Request(url, headers=headinfo)
    try:
        req = opener.open(reqhd, timeout=3)
    except Exception as e:
        # print('invalid ip:', ip, e)
        return
    if req.code == 200:
        print('valid ip:', ip)
        return ip


class GetProxy(object):
    def __init__(self, url=''):
        self.baseurl = url
        self.ua = FakeUserAgent()
        self.pools = []

    def getIps(self):
        return self.pools

    def getByApi(self, url):
        # The API returns a plain list of ip:port strings; keep everything that starts with a digit.
        content = self.reqPage(url)
        if content:
            obj = BeautifulSoup(content, 'lxml')
            listip = [item for item in obj.stripped_strings if re.match(r'\d', item)]
            self.pools.extend(listip)

    def getCharset(self, content):
        # Guess the page encoding from its <meta ... content-type ...> tag, defaulting to utf-8.
        scon = str(content)
        meta = re.search(r'<meta(.*?)content-type(.*?)>', scon, re.I)
        if meta:
            s = meta.group()
            m = re.search(r'charset=(.*?)[\"\' /]', s, re.I)
            if m:
                charset = m.groups()[0]
                return charset
        return 'utf-8'

    def reqPage(self, url):
        time.sleep(2)
        headinfo = {'User-Agent': self.ua.random}
        reqhd = request.Request(url, headers=headinfo)
        try:
            req = request.urlopen(reqhd)
        except Exception as e:
            print('Error:', e)
            return
        if req.code != 200:
            return
        con = req.read()
        charset = self.getCharset(con)
        print(charset)
        try:
            con = con.decode(charset)
        except Exception as e:
            print('decode Error:', e)
            return
        return con

    def parsePage(self, url):
        # Each table row holds an ip and a port in its first two cells.
        con = self.reqPage(url)
        obj = BeautifulSoup(con, 'lxml')
        div = obj.find('div', class_="containerbox boxindex")
        tbody = div.find('tbody')
        listtr = tbody.find_all('tr')
        for tr in listtr[1:]:
            tds = list(tr.stripped_strings)
            ip = ':'.join(tds[:2])
            print(ip)
            self.pools.append(ip)

    def parseArea(self, url):
        print(url)
        con = self.reqPage(url)
        obj = BeautifulSoup(con, 'lxml')
        listpage = obj.find('div', id="PageList")
        lista = listpage.find_all('a')
        for a in lista[:6]:
            step = a.get('href')
            if step.endswith('/index'):
                step = step.replace('/index', '/1.html')
            self.parsePage(self.baseurl + step)

    def start(self):
        # Crawl the area index pages of the site instead of (or in addition to) the API.
        con = self.reqPage(self.baseurl)
        obj = BeautifulSoup(con, 'lxml')
        areas = obj.find('ul', class_="textlarge22")
        if areas:
            lista = areas.find_all('a')
            if lista:
                lista = lista[1:]
            for a in lista:
                step = a.get('href')
                if step:
                    self.parseArea(self.baseurl + step)
                    break


if __name__ == '__main__':
    apiurl = 'http://www.66ip.cn/mo.php?sxb=&tqsl=2000&port=&export=&ktip=&sxa=&submit=%CC%E1++%C8%A1&textarea='
    starturl = 'http://www.66ip.cn'
    proxyhd = GetProxy(url=starturl)
    tpools = thread_pool(50)
    # proxyhd.start()
    proxyhd.getByApi(apiurl)
    ips = proxyhd.getIps()
    print(len(ips))
    validips = []
    for ip in ips:
        tpools.add_task(chekout_proxy, ip)
    stime = time.time()
    tpools.start()
    tpools.join()
    etime = time.time()
    rs = tpools.get_result()
    print('valid ips:', len(rs))
    for ip in rs:
        print(ip)
    print(stime, etime)

###########################
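Next, thead_pool.py: a minimal thread pool built on queue.Queue. add_task() queues a (func, args, kwargs) tuple, start() spawns the worker threads, and each worker exits once get() has waited a full second without receiving a task, so all tasks should be queued before start() is called. Only truthy return values are collected, which is what lets chekout_proxy signal success simply by returning the proxy mapping. The __main__ block at the bottom is a small self-test.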
import threading
from threading import Thread
from queue import Queue


class thread_pool(object):
    def __init__(self, num=5):
        self.qtask = Queue()      # pending (func, args, kwargs) tuples
        self.qresult = Queue()    # truthy return values collected from the workers
        self.tnum = num
        self.tlist = []

    def task(self):
        # Worker loop: exit once the task queue has been empty for a full second.
        # Note that the bare except also ends the worker if a task itself raises.
        while True:
            try:
                func, args, kwargs = self.qtask.get(timeout=1)
                result = func(*args, **kwargs)
                if result:
                    self.qresult.put(result)
            except:
                print('no task in pool')
                break

    def start(self):
        for i in range(self.tnum):
            t = Thread(target=self.task)
            self.tlist.append(t)
            t.start()

    def add_task(self, func, *args, **kwargs):
        self.qtask.put((func, args, kwargs))

    def join(self):
        for t in self.tlist:
            t.join()

    def get_result(self):
        rlist = []
        for i in range(self.qresult.qsize()):
            r = self.qresult.get()
            rlist.append(r)
        return rlist


def testfunc(num, **kwargs):
    print('call test func', num, kwargs)
    return str(num)


if __name__ == '__main__':
    tpool = thread_pool(4)
    for i in range(40):
        tpool.add_task(testfunc, i)
    tpool.start()
    tpool.join()
    rs = tpool.get_result()
    for r in rs:
        print(r)

#############################
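The spider itself is spider_dbinfo.py. It walks the paginated listing at movie.douban.com/tag/2010, opens each film's detail page, parses the div with id="info" plus the rating block into a dict, and (when a savehd handler is supplied) writes the selected fields through it, e.g. a mysqlHandler from the minfo_save module. Proxies are pulled from the shared queue ten at a time, the active proxy is rotated every ten requests, and a proxy that causes a request to fail is dropped from the local list. Run on its own with ipq=None and savehd=None, it crawls directly without a proxy and only prints what it finds.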
# coding:utf-8
import re
from urllib import request
from bs4 import BeautifulSoup
from urllib import error
from fake_useragent import FakeUserAgent
import time
from minfo_save import mysqlHandler

base_url = 'https://movie.douban.com/tag/2010'


class spiderDouban(object):
    def __init__(self, url=None, ipq=None, savehd=None):
        self.starturl = url
        self.infohd = savehd          # storage handler, e.g. a mysqlHandler
        self.ua = FakeUserAgent()
        self.ipqueue = ipq            # shared queue of validated proxies
        self.ips = []
        self.opener = request.build_opener()   # direct opener until a proxy is configured
        self.reqnum = 0
        self.iterips = None
        self.curip = None

    def getipfrom_ips(self):
        # Pull up to 10 proxies from the shared queue into the local list.
        if self.ipqueue:
            while len(self.ips) < 10:
                try:
                    ip = self.ipqueue.get(timeout=1)
                    self.ips.append(ip)
                except:
                    print('no proxy')
                    break

    def start_download(self):
        url = self.starturl
        self.getipfrom_ips()
        while url:
            print('pageurl:', url)
            url = self.load_page(url)
        # Return any unused proxies to the shared queue.
        for ip in self.ips:
            self.ipqueue.put(ip)

    def get_proxyip(self):
        if self.iterips is None:
            self.iterips = iter(self.ips)
        try:
            ip = next(self.iterips)
            return ip
        except:
            if self.ips:
                self.getipfrom_ips()
                self.iterips = iter(self.ips)
                ip = next(self.iterips)
                return ip

    def change_proxy(self):
        ip = self.get_proxyip()
        if ip:
            proxyhd = request.ProxyHandler(ip)
            print(ip)
            self.opener = request.build_opener(proxyhd)
            self.curip = ip
            return True
        return False

    def req_page(self, url):
        req = None
        # Rotate the proxy every 10 requests.
        if self.reqnum % 10 == 0:
            self.change_proxy()
        while True:
            try:
                headinfo = {'User-Agent': self.ua.random}
                reqhd = request.Request(url, headers=headinfo)
                req = self.opener.open(reqhd, timeout=5)
                self.reqnum += 1
                break
            except Exception as e:
                print('catch e:', e)
                # Drop the failing proxy and switch to another one; give up if none is left.
                if self.curip in self.ips:
                    self.ips.remove(self.curip)
                self.curip = None
                if not self.change_proxy():
                    return None
        if req.code != 200:
            return
        pageinfo = req.read().decode('utf-8')
        return pageinfo

    def parse_text(self, minfo):
        # Split the info block into "label: value" pairs and build a dict from them.
        print(minfo)
        listt = [item.strip() for item in minfo.split('\n') if item.strip(' ')]
        listt = [item.split(':', 1) for item in listt]
        listt = [items for items in listt if len(items) == 2 and items[0].strip() and items[1].strip()]
        print(listt)
        dinfo = dict(listt)
        return dinfo

    def parse_minfo(self, url, mname):
        pinfo = self.req_page(url)
        if not pinfo:
            return
        obj = BeautifulSoup(pinfo, 'lxml')
        minfo = obj.find('div', id='info')
        tinfo = minfo.get_text()
        dinfo = self.parse_text(tinfo)
        mscore = obj.find('div', class_='rating_self clearfix')
        score = mscore.find(property="v:average").get_text()
        votes = mscore.find(property="v:votes").get_text()
        dinfo['score'] = score
        dinfo['votes'] = votes
        dinfo['name'] = mname
        print(dinfo.keys())
        for item in dinfo.items():
            print(item)
        return dinfo

    def find_nextpage(self, obj):
        nexturl = None
        if obj:
            nextpage = obj.find('span', class_="next")
            if nextpage:
                a = nextpage.find('a')
                if a:
                    nexturl = a.get('href')
        return nexturl

    def load_page(self, url):
        pinfo = self.req_page(url)
        if not pinfo:
            return
        obj = BeautifulSoup(pinfo, 'lxml')
        items = obj.find_all('tr', class_="item")
        for item in items:
            a = item.find('a')
            murl = a.get('href')
            mname = a.get('title')
            print(murl, mname)
            minfo = self.parse_minfo(murl, mname)
            if minfo and self.infohd:
                keys = ['name', '导演', '主演', '类型', '制片国家/地区', '语言',
                        '上映日期', '片长', '又名', 'score', 'votes']
                self.infohd.write(keys, minfo)
        return self.find_nextpage(obj)

    def load_img(self, info):
        # Download a poster image into a hard-coded local directory.
        imgreq = request.urlopen(info[1])
        img_c = imgreq.read()
        imgf = open('F:\\test\\' + info[0] + '.jpg', 'wb')
        imgf.write(img_c)
        imgf.close()


# sql = mysqlHandler('localhost', 'root', 'abcd1234', 'test_db1', 'mvinfo')
if __name__ == '__main__':
    spider = spiderDouban(base_url, ipq=None, savehd=None)
    spider.start_download()
######################
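Finally, the entry script. getProxy() collects candidate IPs via GetProxy and validates them with a 20-thread pool; getProxyFromWeb() pushes the survivors into a shared Queue, and the main loop tops the queue back up whenever it drops below 20 proxies. The call that would start the spider against that queue (startSpider) is left commented out and is never defined in the listing; a possible sketch follows the script.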
# coding:utf-8
from collect_proxy import GetProxy, chekout_proxy
from thead_pool import thread_pool
from spider_dbinfo import spiderDouban
from queue import Queue
from threading import Thread
import time
from minfo_save import mysqlHandler

gQuit = False


def getProxy():
    # Collect candidate IPs and validate them with a 20-thread pool; return the working ones.
    apiurl = 'http://www.66ip.cn/mo.php?sxb=&tqsl=2000&port=&export=&ktip=&sxa=&submit=%CC%E1++%C8%A1&textarea='
    starturl = 'http://www.66ip.cn'
    proxyhd = GetProxy(url=starturl)
    tpools = thread_pool(20)
    proxyhd.start()
    proxyhd.getByApi(apiurl)
    ips = proxyhd.getIps()
    print(len(ips))
    validips = []
    for ip in ips:
        tpools.add_task(chekout_proxy, ip)
    stime = time.time()
    tpools.start()
    tpools.join()
    rs = tpools.get_result()
    return rs


def getProxyFromWeb(ipqueue):
    # Push freshly validated proxies into the shared queue.
    print(ipqueue)
    ips = getProxy()
    print(len(ips))
    for ip in ips:
        ipqueue.put(ip)


if __name__ == '__main__':
    ipqueue = Queue()
    getProxyFromWeb(ipqueue)
    # startSpider(ipqueue)
    while gQuit == False:
        print('gQuit:', gQuit)
        if ipqueue.qsize() < 20:
            getProxyFromWeb(ipqueue)
        time.sleep(2)
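For completeness, here is a minimal sketch of what the missing startSpider helper could look like. It is hypothetical, not part of the original project, and assumes only what the imports and the spiderDouban constructor above already provide: the spider runs in a background thread and consumes validated proxies from the shared ipqueue while the main loop keeps refilling it.

# Hypothetical sketch; startSpider is referenced above but never defined in the original code.
def startSpider(ipqueue):
    base_url = 'https://movie.douban.com/tag/2010'
    # Pass a mysqlHandler instance as savehd instead of None to persist the results.
    spider = spiderDouban(base_url, ipq=ipqueue, savehd=None)
    t = Thread(target=spider.start_download)
    t.daemon = True  # let the process exit even if the spider is still running
    t.start()
    return t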