#!/usr/bin/env python
# -*- coding:utf-8 -*-
import re
import time
import urllib.request
import conf as cf
# Default URL the crawler starts from: the exploitdb releases listing on GitHub.
BASE_URL = 'https://github.com/offensive-security/exploitdb/releases'

# Regex whose single group lazily captures an href value up to (but not
# including) the literal text `zip" rel="nofollow">`.
DOWNLOAD_LINK_PATTERN = r'href="(.*?)zip" rel="nofollow">'

# Regex matching the text "Next" and everything after it on the same line.
# NOTE(review): presumably used to find the pager on the first page -- confirm
# against the code that consumes it.
FIRST_PATTERN = r'Next.*'

# Regex matching the text ">PreviousNext" and everything after it on the same
# line.
# NOTE(review): presumably used to find the pager on later pages -- confirm
# against the code that consumes it.
PAGE_PATTERN = r'>PreviousNext.*'
class MyCrawler:
def __init__(self, base_url=BASE_URL, start_page="first 1 page"):
    """Remember the crawl settings on the instance.

    Args:
        base_url: Stored as ``self.base_url``; defaults to the module-level
            ``BASE_URL``.
        start_page: Stored unchanged as ``self.start_page``; defaults to
            the string ``"first 1 page"``.
    """
    self.base_url, self.start_page = base_url, start_page
# self.headers = apache_request_headers();
# Crawl the first (home) page.
def first_page(self):
try:
req = urllib.request.Request(self.base_url)
html = urllib.request.urlopen(req)
doc