Python爬虫

最新推荐文章于 2022-04-30 14:01:51 发布

b2b160

最新推荐文章于 2022-04-30 14:01:51 发布

阅读量1.2w

点赞数

分类专栏：脚本文章标签： python image url html 网络爬虫 import

本文链接：https://blog.csdn.net/b2b160/article/details/3906174

版权

脚本专栏收录该内容

35 篇文章 1 订阅

订阅专栏

(1)Python版简单网络爬虫

URLLister类，负责从HTML文本中根据<a>标签的href属性提取URL，但也会提取一些垃圾URL，有待改进

from sgmllib import SGMLParser

class URLLister(SGMLParser):
    def reset(self):
        SGMLParser.reset(self)
        self.urls = []

    def start_a(self, attrs):
        href = [v for k, v in attrs if k=='href']
        if href:
            self.urls.extend(href)

getURL(url)用来将HTML中的url放入urls列表中

import urllib, urllister

def getURL(url):
    """Fetch ``url`` and return the list of links found in its HTML.

    Returns an empty list when the page cannot be opened. Errors raised
    while reading or parsing the page propagate to the caller, but the
    connection is always closed first.
    """
    try:
        usock = urllib.urlopen(url)
    except Exception:
        # Best effort: a URL that cannot be opened simply yields no links.
        # Exception (not a bare except) so that Ctrl-C still stops the crawl.
        print('get url excepton')
        return []
    parser = urllister.URLLister()
    try:
        parser.feed(usock.read())
    finally:
        # Close the connection even when reading or parsing fails.
        usock.close()
    parser.close()
    return parser.urls

spider(startURL,depth)递归调用getURL(url)，startURL为起始URL，depth为递归次数，即遍历的深度

def spider(startURL, depth):
    """Recursively print every link reachable from ``startURL``.

    ``depth`` is the number of link levels to follow. Each URL is printed
    together with the running counter kept in the module-level ``num``,
    which is incremented once per printed URL.

    Returns 0 when ``depth`` is exhausted or the page yields no links,
    and 1 otherwise.
    """
    global num  # running count of printed URLs, shared by all recursive calls
    if depth <= 0:
        return 0
    urls = getURL(startURL)
    if not urls:
        return 0
    for url in urls:
        print('%s %s' % (url, num))
        num = num + 1
        spider(url, depth - 1)
    return 1

调用spider

# Counter of printed URLs; spider() updates it through ``global num``.
num = 0
# Start the crawl at this URL and follow links two levels deep.
spider("http://www.xjtu.edu.cn/",2)

运行结果：

得到962个url，不过其中有一些不符合URL格式的“假货”，比如“nav.php?type=2 944”

“http://gs.xjtu.edu.cn/zhaos/upload/files/462498fcd3d35.doc 285”等

而且每次到.doc这里都会很费时间，可见这个“假货”处理速度很慢。

用正则表达式可以判断URL的合法性，待续……

(2)小小的爬虫程序

Python有一个urllib的库，可以很方便的从给定的url抓取网页，以下这段程序实现了抓取一个url并存到指定文件的功能：

爬虫工作的基本原理就是，给定一个初始的url，下载这个url的网页，然后找出网页上所有满足下载要求的链接，然后把这些链接对应的url下载下来，然后再找下载下来的这些网页的url，我们可以用广度优先搜索实现这个算法，不过，首先得有一个函数找出网页上所有的满足要求的url，下面这个例子用正则表达式找出url.

最后就是广度优先搜索了，这个实现起来也很简单：

作者用上面的算法，感觉速度还行，1小时可以抓10000多网页，可以满足小型系统的要求。

(3)小爬虫1.0

----Spider.py--------
#!/usr/local/bin/python
from mySpider import *

def _site_setting(site, index):
    """Return child ``index`` of a <site> node as a stripped string."""
    return str(site.contents[index].contents[0]).strip()


def main():
    """Crawl every site described in the spider configuration.

    For each ``<site>`` entry returned by ``miniHTMLParser.site_find()`` the
    crawl starts at ``root_url + variable_url`` and follows the links queued
    by the parser until none is left. For every visited page the name,
    description and image are extracted, the prettified HTML is passed to
    ``save_html`` and, at the end, the collected records are passed to
    ``save_xml``.
    """
    mySpider = miniHTMLParser()
    for s in mySpider.site_find():
        # Settings are read by position inside the <site> node, so the
        # order of the elements in the configuration file matters.
        mySpider.search_keyword = _site_setting(s, 27)
        domain = _site_setting(s, 1)
        root_url = _site_setting(s, 3)
        variable_url = _site_setting(s, 5)
        image_save_location = _site_setting(s, 7)
        image_name_len = int(_site_setting(s, 9))
        html_file = _site_setting(s, 11)
        xml_file = _site_setting(s, 25)
        description_index = _site_setting(s, 13)
        description_keyword = _site_setting(s, 15)
        name_index = _site_setting(s, 17)
        name_keyword = _site_setting(s, 19)
        image_index = _site_setting(s, 21)
        image_keyword = _site_setting(s, 23)

        link = root_url + variable_url
        xmlist = []

        # get_next_link() returns '' once the queue is empty; the prefixing
        # at the bottom of the loop turns that into root_url, which ends it.
        while link != root_url:
            print("\nChecking link  %s" % (domain + link))

            mySpider.raw_html = mySpider.gethtmlfile(domain, link)
            mySpider.encode = 'utf-8'
            cleaned_html = mySpider.html_feed()

            # Get content
            clean_theater_name = mySpider.clean_html(name_index, name_keyword)
            clean_theater_description = mySpider.clean_html(
                description_index, description_keyword)

            # Save image; a page without a usable image is recorded as 'None'.
            try:
                image_html_source = mySpider.soup_find_from_source(
                    image_index, image_keyword)
                image_url = mySpider.soup_find_img_src(domain, image_html_source)
                image_name = link[image_name_len:].replace('=', '0').replace('/', '')
                # Same "<domain>-<name><ext>" file name that save_image()
                # builds, so the XML record points at the file actually written.
                image_local = (image_save_location + domain + '-'
                               + image_name + image_url[-4:])
                mySpider.save_image(image_name, domain, image_url)
            except Exception:
                image_url = 'None'
                image_local = 'None'

            xmlist.append(('theater', [
                ('name', clean_theater_name),
                ('description', clean_theater_description),
                ('image_remote', image_url),
                ('image_local', image_local),
            ]))
            mySpider.save_html(html_file, cleaned_html)

            # Feeding the page to the parser queues its matching links.
            mySpider.feed(cleaned_html)
            next_link = mySpider.get_next_link()
            if next_link[0:1] != '/':
                next_link = root_url + next_link
            link = next_link

        mySpider.close()
        print("\ndone\n")
        xml_input = ('Spider', [('Content', xmlist)])
        mySpider.save_xml(xml_file, xml_input)

# Run the crawl only when this file is executed as a script.
if __name__ == "__main__":
    main()

---mySpider.py----
#!/usr/local/bin/python

import httplib
import sys
import re
from pyfo import pyfo
import urllib2

from BeautifulSoup import BeautifulSoup ,SoupStrainer ,BeautifulStoneSoup
from HTMLParser import HTMLParser

class miniHTMLParser( HTMLParser ):
    """HTML link collector plus scraping helpers built on BeautifulSoup.

    When fed HTML, it queues every ``<a href>`` that is relative, matches
    ``search_keyword`` and has not been queued before. The other methods
    download pages, extract fragments of ``raw_html`` and save results.
    """

    # NOTE: these are class attributes, so the two lists are shared by all
    # instances unless an instance rebinds them.
    viewedQueue = []        # every link ever queued (used to skip repeats)
    instQueue = []          # links waiting to be visited, oldest first
    search_keyword = ''     # regex a link must match to be queued
    encode = 'utf-8'        # encoding name used in get_soup()
    raw_html = ''           # HTML of the page currently being processed
    config = 'config.xml'   # path of the site configuration file

    def get_next_link(self):
        """Pop and return the oldest queued link, or '' when none is left."""
        if not self.instQueue:
            return ''
        return self.instQueue.pop(0)

    def gethtmlfile(self, site, page):
        """GET ``page`` from host ``site`` and return the response body.

        Returns '' when the request fails for any reason.
        """
        httpconn = None
        try:
            httpconn = httplib.HTTPConnection(site)
            httpconn.request("GET", page)
            return httpconn.getresponse().read()
        except Exception:
            # Best effort: a failed download is treated as an empty page.
            return ""
        finally:
            if httpconn is not None:
                httpconn.close()

    def handle_starttag(self, tag, attrs):
        """Queue the ``href`` of an ``<a>`` tag when it is a new, matching link.

        Links containing 'http' or 'mailto' and links that do not match
        ``search_keyword`` are reported as ignored; links that were already
        queued once are skipped silently.
        """
        if tag != 'a':
            return
        # Look the href up by name: it is not necessarily the first
        # attribute, and an <a> tag may have no attributes at all.
        href = dict(attrs).get('href')
        if href is None:
            return
        newstr = str(href)
        if (re.search('http', newstr) is not None
                or re.search('mailto', newstr) is not None
                or re.search(self.search_keyword, newstr) is None):
            print(" ignoring %s" % newstr)
            return
        if newstr not in self.viewedQueue:
            print(" adding %s" % newstr)
            self.instQueue.append(newstr)
            self.viewedQueue.append(newstr)

    def clean_html(self, index, keyword):
        """Return the matching elements of ``raw_html`` with most markup removed.

        Every tag is stripped except ``<a ...>``, ``</a>`` and ``<!...>``;
        newlines and square brackets are removed as well.
        """
        html_source = self.soup_find_from_source(index, keyword)
        clean_html_source = re.sub(r'<(?!(?:a\s|/a|!))[^>]*>', '', html_source)
        for junk in ('\n', '[', ']'):
            clean_html_source = clean_html_source.replace(junk, '')
        return clean_html_source.strip()

    def clean_description_list(self, index, keyword):
        """Split the cleaned HTML into a list of short tokens.

        The cleaned text is split on ':' and then on spaces; each non-empty
        token is truncated to its first 10 characters.
        """
        tokens = []
        for chunk in self.clean_html(index, keyword).split(':'):
            for tag in chunk.strip().split(' '):
                if tag == '':
                    continue
                tag = tag.strip()[0:10]
                # NOTE(review): tokens come from a split on spaces, so they
                # cannot equal a string containing a space; this filter looks
                # ineffective as written. Kept to preserve behaviour.
                if tag != '<a href="/':
                    tokens.append(tag)
        return tokens

    def soup_find_from_source(self, index, keyword):
        """Return ``str()`` of the elements found by attribute ``index``.

        The attribute value is matched against the regex ``keyword + "$"``.
        """
        soup = self.get_soup()
        return str(soup.findAll(attrs={index: re.compile(keyword + "$")}))

    def soup_find_img_src(self, domain, image_html_source):
        """Return the URL of the first ``<img>`` in ``image_html_source``.

        A source that does not start with 'http' is prefixed with
        ``'http://' + domain``. Raises IndexError when there is no image and
        KeyError when the image has no ``src`` attribute.
        """
        links = SoupStrainer('img')
        image_tag = BeautifulSoup(image_html_source, parseOnlyThese=links)
        # Read src by name rather than assuming it is the first attribute.
        image_link = image_tag.contents[0]['src']
        if image_link[0:4] != 'http':
            image_link = 'http://' + domain + image_link
        return image_link

    def save_xml(self, xml_file, xml_input):
        """Serialize ``xml_input`` with pyfo and write it to ``xml_file``."""
        xml_result = pyfo(xml_input, pretty=True, prolog=True, encoding='utf-8')
        with open(xml_file, 'w') as xml:
            xml.write(xml_result + "\n")

    def save_html(self, html_file, html_input):
        """Append ``html_input`` to ``html_file``."""
        with open(html_file, 'a') as html:
            html.write(html_input)

    def save_image(self, imge_name, domain, image_link):
        """Download ``image_link`` into the ``image/`` directory.

        The file is named ``<domain>-<imge_name>`` followed by the last four
        characters of the link. Returns 'ok' on success and '' on failure.
        """
        try:
            image = urllib2.build_opener().open(image_link)
            try:
                image_source = image.read()
            finally:
                image.close()
            final_image_name = 'image/' + domain + '-' + imge_name + image_link[-4:]
            with open(final_image_name, "wb") as fout:
                fout.write(image_source)
        except Exception:
            # Best effort: a missing or unreachable image is not fatal.
            return ''
        return 'ok'

    def html_feed(self):
        """Return ``raw_html`` re-formatted by BeautifulSoup's ``prettify``."""
        return self.get_soup().prettify()

    def get_soup(self):
        """Parse ``raw_html`` and return the resulting BeautifulSoup object."""
        soup = BeautifulSoup(self.raw_html)
        # Kept from the original code; the returned string is discarded.
        soup.__str__(self.encode)
        return soup

    def load_config(self):
        """Read the file named by ``config`` and return it parsed as XML."""
        with open(self.config, 'r') as config:
            config_xml = config.read()
        return BeautifulStoneSoup(config_xml)

    def site_find(self):
        """Return all ``<site>`` elements of the configuration file."""
        return self.load_config().findAll('site')

-----Config.XML--------
<?xml version="1.0" encoding="utf-8"?>
<spider>
<site>
      <domain>www.example1.com.cn</domain>
      <root_url>/movie/</root_url>
      <variable_url >theater.asp</variable_url>
      <image_save_location>image/</image_save_location>
      <image_name_len>-2</image_name_len>
      <html_file>Crawling Page/piao.html</html_file>
      <description_index>bgcolor</description_index>
      <description_keyword>#2B2B2B</description_keyword>
      <theater_index>bgcolor</theater_index>
      <theater_keyword>#4B4B4B</theater_keyword>
      <image_index>width</image_index>
      <image_keyword>169</image_keyword>
      <xml_file>Crawing Content/piao.xml</xml_file>
      <search_keyword>theater.asp</search_keyword>
</site>
<site>
      <domain>www.example.com</domain>
      <root_url>/theater/intro/BeiJing/</root_url>
      <variable_url>CHANGHONG/</variable_url>
      <image_save_location>image/</image_save_location>
      <image_name_len>-5</image_name_len>
      <html_file>Crawling Page/mvgod.html</html_file>
      <description_index>class</description_index>
      <description_keyword>main</description_keyword>
      <name_index>class</name_index>
      <name_keyword>namezi</name_keyword>
      <image_index>style</image_index>
      <image_keyword>float:right;margin:4px</image_keyword>
      <xml_file>Crawing Content/mvgod.xml</xml_file>
      <search_keyword>/theater/intro/BeiJing/</search_keyword>
</site>
</spider>

(4)用python编写网络爬虫

刚刚开了一门《计算机网络》的课，觉得很有用。正好师兄让我练习编写一个能下载网站网页的程序，正好能用上课上的知识了。我想做一个效率不差的程序，而下载网页的性能瓶颈在网络上，所以决定用Python编写代码。刚学python没几天，学习一种语言的最好方法就是写code。下面是我用多线程实现的网络爬虫，并用py2exe生成了一个exe，自己觉得py2exe不太好，又不会更好的，只能......
这是我这些天的成果。希望有人能提出好的建议，先谢谢了！一共两个文件，一个是toolbox_insight.py，是一个工具文件另一个是test.py，是一个用到toolbox_insight.py中工具的测试文件

#FileName: toolbox_insight.py
from sgmllib import SGMLParser
import threading
import time
import urllib2
import StringIO
import gzip
import string
import os
#rewrite SGMLParser for start_a
class Basegeturls(SGMLParser):   #这个Basegeturls类作用是分析下载的网页，把网页中的所有链接放在self.url中。
    def reset(self):
        self.url = []
        SGMLParser.reset(self)

    def start_a(self, attrs):
        href = [v for k, v in attrs if k == 'href']
        if href:
            self.url.extend(href)

#for quickly finding
class Newlist(list):
    """Sorted list with a combined "is it known? if not, remember it" lookup.

    The list stays sorted as long as items are only ever added via ``find``.
    """

    def find(self, num):
        """Report whether ``num`` is present, inserting it when it is not.

        Returns True if ``num`` is already in the list. Otherwise ``num`` is
        inserted at its sorted position and False is returned.
        """
        # Imported locally so the class needs no extra module-level import.
        from bisect import bisect_left

        pos = bisect_left(self, num)
        if pos < len(self) and self[pos] == num:
            return True
        self.insert(pos, num)
        return False

#下面的reptile顾名思义是一个爬虫
class reptile(threading.Thread):
    """Crawler thread: downloads the pages named in a shared task queue.

    Parameters:
        Name:        thread name; also the name of this crawler's sub-directory.
        queue:       task queue shared by all crawlers. Each item is the URL
                     of a page to download; ``None`` tells the crawler to stop.
        result:      queue that receives every link found in a downloaded page.
        Flcok:       lock guarding the shared index file ``configfile``.
        inittime:    not used by this program; reserved for future extension.
        downloadway: directory under which downloaded pages are stored.
        configfile:  index file; gets one "<saved path> <url>" line per page.
        maxnum:      the crawler stops once it has downloaded this many pages.
    """
    def __init__(self, Name, queue, result, Flcok, inittime = 0.00001, downloadway = 'D://bbs//',configfile = 'D://bbs//conf.txt', maxnum = 10000):
        threading.Thread.__init__(self, name = Name)
        self.queue = queue
        self.result = result
        self.Flcok = Flcok
        self.inittime = inittime
        self.mainway = downloadway
        self.configfile = configfile
        self.num = 0          # number of pages downloaded so far
        self.maxnum = maxnum
        crawler_dir = downloadway + self.getName()
        # Tolerate a directory left over from a previous run.
        if not os.path.isdir(crawler_dir):
            os.makedirs(crawler_dir)
        self.way = crawler_dir + '//'

    def run(self):
        """Download queued URLs until ``None`` arrives or ``maxnum`` is reached."""
        opener = urllib2.build_opener()
        while True:
            url = self.queue.get()
            if url is None:
                # Sentinel that lets the caller end the crawler's life.
                break
            try:
                self._download(opener, url)
            except Exception:
                # One bad page must not kill the crawler.
                print('end error')
            if self.num >= self.maxnum:
                break

    def _download(self, opener, url):
        """Fetch ``url``, queue the links it contains and save the page."""
        request = urllib2.Request(url)
        # Ask for a gzip-compressed body to reduce network traffic.
        request.add_header('Accept-encoding', 'gzip')
        page = opener.open(request)
        try:
            if page.code != 200:
                return
            predata = page.read()
        finally:
            page.close()

        try:
            data = gzip.GzipFile(fileobj = StringIO.StringIO(predata)).read()
        except IOError:
            # The server ignored the gzip request and sent the plain page.
            print('unused gzip')
            data = predata

        parser = Basegeturls()
        try:
            parser.feed(data)
        except Exception:
            # Some responses cannot be parsed, e.g. when the page is an image.
            print('I am here')
        for item in parser.url:
            self.result.put(item)

        way = self.way + str(self.num) + '.html'
        self.num += 1
        with open(way, 'w') as page_file:
            page_file.write(data)
        # The index file is shared by all crawlers, so writes are serialized.
        # ``with`` releases the lock even when the write fails; otherwise
        # every other crawler would block on it forever.
        with self.Flcok:
            with open(self.configfile, 'a') as confile:
                confile.write(way + ' ' + url + '\n')

#和爬虫一样是个线程类,作用是将爬虫中的result中存入的URL加以处理。只要同一个服务器的网页
class proinsight(threading.Thread):
    """Filter thread: turns raw links into new URLs of the crawled site.

    Parameters:
        queue:    queue of raw links (the crawlers' ``result`` queue);
                  ``None`` stops the thread.
        list:     object with a ``find(url)`` method (a ``Newlist``) that
                  returns True for a known URL and otherwise records it and
                  returns False.
        homepage: URL prefix of the site being crawled; absolute links that
                  do not start with it are dropped.
        inqueue:  queue that receives each new URL to download.
    """
    def __init__(self, queue, list, homepage, inqueue):
        threading.Thread.__init__(self)
        self.queue = queue
        self.list = list
        self.homepage = homepage
        self.inqueue = inqueue

    def run(self):
        """Process links until ``None`` is read from the queue."""
        while True:
            item = self.queue.get()
            if item is None:
                break
            url = self._absolute_url(item)
            # find() also records the URL, so each one is queued only once.
            if url is not None and not self.list.find(url):
                self.inqueue.put(url)

    def _absolute_url(self, item):
        """Return the absolute same-site URL for ``item``, or None to drop it."""
        if item.startswith('\r\n'):
            item = item[2:]
        if not item:
            # Empty href: nothing to download (and nothing to index into).
            return None
        if item.endswith('/'):
            item = item[:-1]
        if item.startswith('http://'):
            # Absolute link: keep it only when it belongs to the crawled site.
            if item.startswith(self.homepage):
                return item
            return None
        if item.startswith(('/java', 'java')):
            return None
        if not item:
            # The link was just "/": it refers to the site root.
            return self.homepage
        if not item.startswith('/'):
            item = '/' + item
        return self.homepage + item
下面的是一个主函数过程
我下载的网站是 http://bbs.hit.edu.cn
开始网页是 http://bbs.hit.edu.cn/mainpage.php
#FileName:test
from toolbox_insight import *
from Queue import Queue
import threading
import sys
# Crawl parameters, read interactively.
num = int(raw_input('Enter the number of thread:'))
pnum = int(raw_input('Enter the number of download pages:'))
mainpage = str(raw_input('The mainpage:'))
startpage = str(raw_input('Start page:'))
queue = Queue()      # URLs waiting to be downloaded (input of the crawlers)
key = Queue()        # raw links found by the crawlers (input of the filter)
inqueue = Queue()    # new same-site URLs produced by the filter
list = Newlist()
thlist = []
Flock = threading.RLock()
for i in range(num):
    th = reptile('th' + str(i), queue, key, Flock)
    thlist.append(th)
pro = proinsight(key, list, mainpage, inqueue)
pro.start()
for th in thlist:
    th.start()
queue.put(startpage)
# Hand pnum filtered URLs to the crawlers.
# NOTE: inqueue.get() blocks until the filter produces a URL, so this waits
# indefinitely when the site yields fewer than pnum new links.
for i in range(pnum):
    queue.put(inqueue.get())
# One stop sentinel per crawler.
for i in range(num):
    queue.put(None)
# Wait for the crawlers to finish, then stop the filter thread as well;
# without its own sentinel it would block on key.get() forever and keep the
# process alive.
for th in thlist:
    th.join()
key.put(None)
pro.join()

个人觉得用wxpython来实现用户界面和用数据库知识查找URL是更好的扩展方向

用python编写分布式爬虫

1、网络连接需要持续连接（persistent connection），DNS解析的瓶颈（先查本地DNS缓存）

实现方法:基于python httplib（对http1.1完成对持续连接的支持(python的httplib完全支持http1.1)，如果不是http1.1那么可以使用 urlopen对其进行一次连接）并对其socket对象进行控制,关键是加入对读取DNS本地缓存(在我的机制下这个根本就不是主要问题可以暂时忽略)，以及有settimeout(Igloo)(搞定,就用setdefaulttimeout())的支持(或者利用自己的DNS服务器,进行优化处理),以及对sock对象的settimeout进行设置,防止长时间的等待一个有可能连接不上的web服务器.(要测试一下连接模块和DNS解析模块在访问不存在url在默认情况下的时间消耗)对站点的ip解析出来后就直接用ip进行连接而避免了重复调用DNS解析.例子: socket.gethostbyname("www.163.com")

网络连接下载模块非常重要，需要精心反复测试，因为有可能碰到一些不规范的web服务器，如果没有加以考虑会使整个线程崩溃。

2、多线程:机器任务的分配及站点任务的分配。

实现方法：（在某台机器上实现，在对本机内存cpu的消耗情况判断后对机器任务进行分配；在对和站点的连接情况进行判断后对站点任务进行分配）
机器任务的分配：对于机器负担的情况调整在一个机器开的线程的个数。（在关闭线程时注意要先让线程完成当前运行任务）
站点任务的分配：就是某个机器对一个站点开的线程的个数的分配。（同样是要注意关闭线程时先让其完成当前任务）

3、对web文件树遍历过程更好的控制，对web文件树在广度优先遍历时层次的判断。（整个网络是一个图，而某个站点的模型更接近于一棵树）

实现方法：在每个地址进入队列时加一个层次号，那么要遍历第n层的话那么遍历到第一个n+1就停止读取。

4、利用robotparser解析robots.txt

5、单个机器spider的作用：

a) 同2多线程3文件树的遍历

b) 将获取的外部url发回中央控制器，并从中央控制器取回新的外部url。

6、中央控制器的作用：

a) 观察各机器的状态包括：cpu、内存、线程、站点、网络流量
b) 观察对外整体网络流量和连接状况，可以根据网络状况来调节timeout。

c) 接受各个机器发送过来的外部url并对每个url的重复数字计数。然后分配到各个机器。（分配时要用爬行策略控制器对外部url进行排序来分配，Igloo利用Page Rank，我们可以使用最简单的重复越多重要系数就越高来进行排序）

d) 分布式URL分配算法：Igloo1.2的二级哈希映射算法（集中式分配算法那个中央控制器容易成为系统瓶颈）复习哈希算法，还有就是对url是否访问过的判断（Igloo使用的是URL Trie滞后合并策略）。可以使用Berkeley DB作为URL Trie的替代品。两种实现方式的比较：

i. 现在的想法：（面向站点，信息颗粒大）外部链接只是保存主机名比如:www.163.com, 站内访问用解析好的ip地址维持连接，用相对链接来得到各个页面，这样就要维护一个外部链接列表，几个站点的链接列表。优点：节省内存，对某个站点的信息获取全面，对站点的出现频率统计，排序，重要站点先取。缺点：对链接的获取的全面性得不到保证，而且不能获取更多的重要页面，每个站点的重要页面也不会很多。
ii. 老方案：（面向页面，信息颗粒小）所有连接一视同仁。缺点：浪费资源，对单一站点的获取不一定全面。优点：可以得到全面的链接图，可以使用Page Rank对列表进行排序，页面更重要就在最前面。

7、解析html（超级链接的提取）搞定（用python的sgmllib）缺点：速度太慢（可能会造成瓶颈，要好好包装好，以后有机会换掉它）

(5)多线程实例

# -*- coding:utf-8 -*-
import urllib, httplib
import thread
import time
from Queue import Queue, Empty, Full
# Default request headers; per-task headers are merged on top of these.
HEADERS = {"Content-type": "application/x-www-form-urlencoded",
						'Accept-Language':'zh-cn',
						'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0;Windows NT 5.0)',
						"Accept": "text/plain"}
# Status passed to fail_op when the request itself raised an exception.
UNEXPECTED_ERROR = -1
# HTTP method names.
POST = 'POST'
GET = 'GET'
def base_log(msg):
	"""Default logger: print ``msg`` to standard output."""
	print(msg)
def base_fail_op(task, status, log):
	"""Default failure handler: report ``task`` and ``status`` through ``log``."""
	message = 'fail op. task = %s, status = %d' % (str(task), status)
	log(message)
def get_remote_data(tasks, results, fail_op = base_fail_op, log = base_log):
	"""Worker loop: take tasks from ``tasks`` forever and run their HTTP requests.

	Each task is a dict with the keys 'id' and 'conn_args' (keyword arguments
	for ``httplib.HTTPConnection``) and the optional keys 'params', 'method',
	'url' and 'headers'. On success ``(id, body)`` is put on ``results``. A
	failed request or a non-200 status is reported through ``fail_op``. A
	task without 'id' or 'conn_args' is logged and skipped.
	"""
	while True:
		task = tasks.get()
		try:
			tid = task['id']
			hpt = task['conn_args'] # hpt <= host:port, timeout
		except KeyError as e:
			log(str(e))
			continue
		# %s rather than %d: a non-integer task id must not kill the worker.
		log('thread_%s doing task %s'%(thread.get_ident(), tid))

		params = urllib.urlencode(task.get('params', {}))
		method = task.get('method', GET)
		url = task.get('url', '/')

		# Work on a copy: updating HEADERS itself would leak one task's
		# headers into every later task and race between worker threads.
		headers = dict(HEADERS)
		headers.update(task.get('headers') or {})
		headers['Content-Length'] = len(params)

		conn = httplib.HTTPConnection(**hpt)
		try:
			try:
				if method == POST:
					conn.request(method, url, params, headers)
				else:
					# NOTE(review): non-POST requests are sent without
					# ``headers`` (unchanged behaviour) - confirm intent.
					conn.request(method, url + params)
				response = conn.getresponse()
			except Exception as e:
				log('request failed. method = %s, url = %s, params = %s headers = %s'%(
							method, url, params, headers))
				log(str(e))
				fail_op(task, UNEXPECTED_ERROR, log)
				continue

			if response.status != httplib.OK:
				fail_op(task, response.status, log)
				continue

			data = response.read()
		finally:
			# Release the socket whether the request succeeded or not.
			conn.close()
		results.put((tid, data), True)
		
class HttpPool(object):
	"""Pool of worker threads that perform HTTP requests in the background.

	Tasks are queued with ``add_task``; finished ``(task id, body)`` pairs
	are collected with ``get_results``.
	"""
	def __init__(self, threads_count, fail_op, log):
		"""Start ``threads_count`` threads running ``get_remote_data``.

		``fail_op`` and ``log`` are passed through to every worker.
		"""
		self._tasks = Queue()
		self._results = Queue()

		for _ in range(threads_count):
			thread.start_new_thread(get_remote_data,
					(self._tasks, self._results, fail_op, log))

	def add_task(self, tid, host, url, params, headers = None, method = 'GET', timeout = None):
		"""Queue one request; return True if queued, False if the queue is full.

		``headers`` defaults to no extra headers. ``timeout`` is only passed
		to the connection when it is not None.
		"""
		conn_args = {'host' : host}
		if timeout is not None:
			conn_args['timeout'] = timeout
		task = {
			'id' : tid,
			'conn_args' : conn_args,
			# A fresh dict per task instead of a shared mutable default.
			'headers' : {} if headers is None else headers,
			'url' : url,
			'params' : params,
			'method' : method,
			}
		try:
			self._tasks.put_nowait(task)
		except Full:
			return False
		return True

	def get_results(self):
		"""Return all results available right now, without blocking."""
		results = []
		while True:
			try:
				results.append(self._results.get_nowait())
			except Empty:
				break
		return results
		
def test_google(task_count, threads_count):
	"""Demo: send ``task_count`` searches through ``threads_count`` workers.

	After queuing the tasks it polls for results forever and prints the
	task id and body length of each one; stop it with Ctrl-C.
	"""
	# Imported here because the module only imports ``random`` under
	# ``__main__``; calling this function from an importing module would
	# otherwise fail with NameError.
	import random

	hp = HttpPool(threads_count, base_fail_op, base_log)
	for i in range(task_count):
		if hp.add_task(i, 'www.google.cn', '/search?', {'q' : 'lai'}):
			print('add task successed.')

	while True:
		results = hp.get_results()
		if not results:
			# Nothing ready yet: back off for a random time of up to 1 second.
			time.sleep(1.0 * random.random())
		for i in results:
			print('%s %s' % (i[0], len(i[1])))
			
# Script entry point; the two command-line arguments are the number of
# tasks and the number of worker threads.
if __name__ == '__main__':
	import sys, random
	task_count, threads_count = int(sys.argv[1]), int(sys.argv[2])
	test_google(task_count, threads_count)

b2b160

关注

0
点赞
踩
19

收藏

觉得还不错? 一键收藏
打赏
1
评论
Python爬虫

(1)Python版简单网络爬虫 URLLister类，负责从HTML文本中根据标签提取URL，但也会提取一些垃圾URL，有待改进from sgmllib import SGMLParserclass URLLister(SGMLParser): def reset(self):
复制链接

扫一扫