# coding:utf8
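# A small breadth-first domain crawler: starting from the seed URLs it
# downloads each page, extracts the root domains of external links, appends
# every newly seen domain to a timestamped output file, and then crawls
# those domains in turn.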
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import datetime
import sys
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',}
timeout = 5
# Suppress the InsecureRequestWarning triggered by requests made with verify=False.
requests.packages.urllib3.disable_warnings()
class UrlManager:
    # Tracks URLs still to crawl (new_urls) and URLs already crawled (old_urls).
    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    def add_new_url(self, urls):
        # Expects an iterable of URLs; every previously unseen URL is queued
        # and appended to the output file ('datefile' is the module-level
        # path created in the __main__ block).
        if urls is None:
            return None
        for u in urls:
            if u not in self.new_urls and u not in self.old_urls:
                with open(datefile, 'a') as f:
                    f.write(u + '\n')
                self.new_urls.add(u)

    def add_new_urls(self, urls):
        if not urls:
            return None
        self.add_new_url(urls)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        # Pop an unvisited URL, mark it as visited, and prepend a scheme if missing.
        new_url = self.new_urls.pop()
        http_url = new_url if '://' in new_url else 'http://' + new_url
        self.old_urls.add(new_url)
        return http_url
class HtmlDownLoader:
    def download(self, url):
        if url is None:
            return None
        r = requests.get(url=url, headers=headers, timeout=timeout,
                         allow_redirects=True, stream=True, verify=False)
        if r.status_code != 200:
            return None
        return r.text
class HtmlParser:
    def __init__(self):
        self.foreign_urls = set()

    def _get_root_domain(self, url):
        # Return the netloc (host[:port]) portion of a URL, or None if there is none.
        if url is None:
            return None
        url_info = urlparse(url)
        root_domain = url_info.netloc
        if root_domain:
            return root_domain

    def _get_new_urls(self, soup, current_url):
        # Collect the root domains of all links pointing outside the current domain.
        new_urls = set()
        links = soup.find_all("a")
        for link in links:
            new_url = link.get('href')
            if new_url is not None:
                new_url = new_url.strip()
                new_url_root_domain = self._get_root_domain(new_url)
                if new_url_root_domain != self._get_root_domain(current_url):
                    if new_url_root_domain is not None:
                        new_urls.add(new_url_root_domain)
        return new_urls

    def parse(self, html_content, current_url):
        if html_content is None:
            return
        soup = BeautifulSoup(html_content, "html.parser")
        new_urls = self._get_new_urls(soup, current_url)
        return new_urls

    def get_foreign_urls(self):
        return self.foreign_urls
class SpiderMain:
    def __init__(self):
        self.urls = UrlManager()
        self.html_downloader = HtmlDownLoader()
        self.parser = HtmlParser()

    def craw(self, root_url):
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            new_url = self.urls.get_new_url()
            try:
                html_content = self.html_downloader.download(new_url)
                new_urls = self.parser.parse(html_content, new_url)
                self.urls.add_new_urls(new_urls)
                # TODO
                for i in new_urls:
                    print("-- %s" % i)
            except KeyboardInterrupt:
                print('[-] User exit!')
                sys.exit(1)
            except Exception:
                # A failed download leaves new_urls as None, so the loop above
                # raises and the URL is reported here as unreachable.
                print("++ %s" % new_url)
if __name__ == "__main__":
    # Create the timestamped output file that collected domains are appended to.
    datefile = datetime.datetime.now().strftime('%Y%m%d_%H-%M-%S.txt')
    with open(datefile, 'w'):
        pass
    root_url = set()
    root_url.add("www.hao123.com")
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)