A Python-Based Web Crawler System: WebCollector-Python

WebCollector-Python

WebCollector-Python is a Python crawler framework (kernel) that requires no configuration and is easy to build upon. It provides a concise API; a powerful crawler can be implemented with only a small amount of code.

WebCollector (Java version)

Compared with WebCollector-Python, the Java version of WebCollector offers higher performance: https://github.com/CrawlScript/WebCollector

Installation

Install with pip:

pip install https://github.com/CrawlScript/WebCollector-Python/archive/master.zip
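After installation, a quick import check confirms the package is available. This is a minimal sketch; it only assumes that the classes used in the examples below (RamCrawler and RedisCrawler) are importable from the top-level package:

# coding=utf-8

# Minimal post-install sanity check: these are the classes used in the
# examples below; an ImportError here means the installation failed.
import webcollector as wc

print(wc.RamCrawler)
print(wc.RedisCrawler)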

Examples

Basic

Quick Start

Automatically Detect URLs

# coding=utf-8

import webcollector as wc


class NewsCrawler(wc.RamCrawler):
    def __init__(self):
        super().__init__(auto_detect=True)
        self.num_threads = 10
        self.add_seed("https://github.blog/")
        self.add_regex("+https://github.blog/[0-9]+.*")
        self.add_regex("-.*#.*")  # do not detect urls that contain "#"

    def visit(self, page, detected):
        if page.match_url("https://github.blog/[0-9]+.*"):
            title = page.select("h1.lh-condensed")[0].text.strip()
            content = page.select("div.markdown-body")[0].text.replace("\n", " ").strip()
            print("\nURL: ", page.url)
            print("TITLE: ", title)
            print("CONTENT: ", content[:50], "...")


crawler = NewsCrawler()
crawler.start(10)
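The add_regex rules above use an include/exclude convention: a pattern prefixed with "+" whitelists URLs, and a pattern prefixed with "-" blacklists them. The sketch below only illustrates that assumed semantics; it is not WebCollector-Python's internal implementation, and url_passes is a hypothetical helper:

# coding=utf-8

import re


def url_passes(url, rules):
    # Hypothetical helper illustrating the assumed "+/-" regex semantics:
    # a URL is kept if it fully matches at least one "+" rule
    # and does not match any "-" rule.
    included = any(re.fullmatch(rule[1:], url) for rule in rules if rule.startswith("+"))
    excluded = any(re.fullmatch(rule[1:], url) for rule in rules if rule.startswith("-"))
    return included and not excluded


rules = ["+https://github.blog/[0-9]+.*", "-.*#.*"]
print(url_passes("https://github.blog/2019-02-01-some-post/", rules))           # True
print(url_passes("https://github.blog/2019-02-01-some-post/#comments", rules))  # False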

Manually Detect URLs

# coding=utf-8

import webcollector as wc


class NewsCrawler(wc.RamCrawler):
    def __init__(self):
        super().__init__(auto_detect=False)
        self.num_threads = 10
        self.add_seed("https://github.blog/")

    def visit(self, page, detected):
        detected.extend(page.links("https://github.blog/[0-9]+.*"))
        if page.match_url("https://github.blog/[0-9]+.*"):
            title = page.select("h1.lh-condensed")[0].text.strip()
            content = page.select("div.markdown-body")[0].text.replace("\n", " ").strip()
            print("\nURL: ", page.url)
            print("TITLE: ", title)
            print("CONTENT: ", content[:50], "...")


crawler = NewsCrawler()
crawler.start(10)

Filter Detected URLs with the detected_filter Plugin

# coding=utf-8

import re

import webcollector as wc
from webcollector.filter import Filter


class RegexDetectedFilter(Filter):
    def filter(self, crawl_datum):
        if re.fullmatch("https://github.blog/2019-02.*", crawl_datum.url):
            return crawl_datum
        else:
            print("filtered by detected_filter: {}".format(crawl_datum.brief_info()))
            return None


class NewsCrawler(wc.RamCrawler):
    def __init__(self):
        super().__init__(auto_detect=True, detected_filter=RegexDetectedFilter())
        self.num_threads = 10
        self.add_seed("https://github.blog/")

    def visit(self, page, detected):
        detected.extend(page.links("https://github.blog/[0-9]+.*"))
        if page.match_url("https://github.blog/[0-9]+.*"):
            title = page.select("h1.lh-condensed")[0].text.strip()
            content = page.select("div.markdown-body")[0].text.replace("\n", " ").strip()
            print("\nURL: ", page.url)
            print("TITLE: ", title)
            print("CONTENT: ", content[:50], "...")


crawler = NewsCrawler()
crawler.start(10)
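A detected_filter can implement any acceptance logic, since it only has to return the crawl_datum to keep a URL or None to drop it. As a further illustration, here is a hypothetical filter that keeps only URLs from a whitelisted set of hosts; it is a sketch that reuses the same Filter interface shown above and is not part of the library:

# coding=utf-8

from urllib.parse import urlparse

from webcollector.filter import Filter


class HostWhitelistFilter(Filter):
    # Hypothetical filter: keep only URLs whose host is in a whitelist.
    def __init__(self, allowed_hosts):
        super().__init__()
        self.allowed_hosts = set(allowed_hosts)

    def filter(self, crawl_datum):
        if urlparse(crawl_datum.url).hostname in self.allowed_hosts:
            return crawl_datum
        return None  # returning None drops the detected URL


# Usage (same place as RegexDetectedFilter above), e.g.:
# super().__init__(auto_detect=True, detected_filter=HostWhitelistFilter(["github.blog"]))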

Resumable Crawling with RedisCrawler (crawling can be resumed after shutdown)

# coding=utf-8

from redis import StrictRedis

import webcollector as wc


class NewsCrawler(wc.RedisCrawler):
    def __init__(self):
        super().__init__(redis_client=StrictRedis("127.0.0.1"),
                         db_prefix="news",
                         auto_detect=True)
        self.num_threads = 10
        self.resumable = True  # you can resume crawling after shutdown
        self.add_seed("https://github.blog/")
        self.add_regex("+https://github.blog/[0-9]+.*")
        self.add_regex("-.*#.*")  # do not detect urls that contain "#"

    def visit(self, page, detected):
        if page.match_url("https://github.blog/[0-9]+.*"):
            title = page.select("h1.lh-condensed")[0].text.strip()
            content = page.select("div.markdown-body")[0].text.replace("\n", " ").strip()
            print("\nURL: ", page.url)
            print("TITLE: ", title)
            print("CONTENT: ", content[:50], "...")


crawler = NewsCrawler()
crawler.start(10)
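StrictRedis("127.0.0.1") uses the redis-py defaults (port 6379, database 0), so this example assumes a local Redis server is running. For a non-default setup, the client can be constructed explicitly:

from redis import StrictRedis

# Equivalent to StrictRedis("127.0.0.1") with defaults; adjust host, port
# and db to match your Redis deployment.
redis_client = StrictRedis(host="127.0.0.1", port=6379, db=0)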

Customize HTTP Requests with the requests Library

# coding=utf-8

import requests

import webcollector as wc
from webcollector.model import Page
from webcollector.plugin.net import HttpRequester


class MyRequester(HttpRequester):
    def get_response(self, crawl_datum):
        # custom http request
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
        }
        print("sending request with MyRequester")

        # send request and get response
        response = requests.get(crawl_datum.url, headers=headers)

        # update code
        crawl_datum.code = response.status_code

        # wrap http response as a Page object
        page = Page(crawl_datum,
                    response.content,
                    content_type=response.headers["Content-Type"],
                    http_charset=response.encoding)
        return page


class NewsCrawler(wc.RamCrawler):
    def __init__(self):
        super().__init__(auto_detect=True)
        self.num_threads = 10

        # set requester to enable MyRequester
        self.requester = MyRequester()

        self.add_seed("https://github.blog/")
        self.add_regex("+https://github.blog/[0-9]+.*")
        self.add_regex("-.*#.*")  # do not detect urls that contain "#"

    def visit(self, page, detected):
        if page.match_url("https://github.blog/[0-9]+.*"):
            title = page.select("h1.lh-condensed")[0].text.strip()
            content = page.select("div.markdown-body")[0].text.replace("\n", " ").strip()
            print("\nURL: ", page.url)
            print("TITLE: ", title)
            print("CONTENT: ", content[:50], "...")


crawler = NewsCrawler()
crawler.start(10)
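The same pattern covers any option supported by requests. For example, a hypothetical variant of the requester adds a timeout and routes traffic through a proxy; the timeout value and proxy address below are placeholders, not values required by WebCollector-Python:

# coding=utf-8

import requests

from webcollector.model import Page
from webcollector.plugin.net import HttpRequester


class ProxyRequester(HttpRequester):
    # Hypothetical requester: same Page wrapping as MyRequester above,
    # plus a request timeout and a placeholder proxy.
    def get_response(self, crawl_datum):
        response = requests.get(
            crawl_datum.url,
            timeout=10,                                   # seconds; placeholder value
            proxies={"https": "http://127.0.0.1:8080"},   # placeholder proxy address
        )
        crawl_datum.code = response.status_code
        return Page(crawl_datum,
                    response.content,
                    content_type=response.headers.get("Content-Type"),
                    http_charset=response.encoding)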
