内涵段子——脑筋急转弯——spider

最新推荐文章于 2024-11-02 16:28:26 发布

weixin_30911809

最新推荐文章于 2024-11-02 16:28:26 发布

阅读量179

点赞数

文章标签： python

原文链接：http://www.cnblogs.com/Skyda/p/10006672.html

版权

# python 3.7
from urllib.request import Request,urlopen
import re,time

class Neihan(object):
    """Spider that scrapes brain-teaser entries from www.neihan8.com.

    Each listing page is downloaded, (question, answer) pairs are extracted
    with a regular expression, and the pairs are appended to a UTF-8 text
    file (``self.outfile``).
    """

    # One entry on a listing page: group 1 is the question title, group 2 is
    # the answer text.  A raw string is used so that ``\s`` reaches the regex
    # engine instead of being treated as an (invalid) string escape.
    _ITEM_RE = re.compile(
        r'class="title" title=".*?">(.*?)</a></h3>\s+<div class="desc">(.*?)</div>'
    )

    def __init__(self, outfile='11.txt'):
        """Set up the request headers, the base URL and the output path.

        Args:
            outfile: Path of the text file the results are appended to.
        """
        # Header names and the Referer URL must not contain spaces; with
        # names such as 'User - Agent' the real header is never sent and
        # urllib falls back to its default User-Agent.
        self.header = {
            'Host': 'www.neihan8.com',
            'Referer': 'https://www.neihan8.com/njjzw//',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
            "Cookie": 'UM_distinctid=1673e837ae7146-0363c5477e0b8a-424f0928-13c680-1673e837ae9355; CNZZDATA1274349754=965294396-1542939999-%7C1542939999; Hm_lvt_94f4eb93f17efa632a5c8a01b23da410=1542942067; npreuecookieclassrecord=%2C2%2C14%2C1%2C; CNZZDATA5804950=cnzz_eid%3D222162018-1542942068-https%253A%252F%252Fwww.neihan8.com%252F%26ntime%3D1542942068; Hm_lpvt_94f4eb93f17efa632a5c8a01b23da410=1542943190'
        }
        self.static = 'https://www.neihan8.com/njjzw/'
        self.outfile = outfile

    @staticmethod
    def _extract_prefix(args):
        """Return the answer prefix carried in ``args``, or None if absent.

        Accepts both the flat form ``(prefix,)`` and the nested form
        ``((prefix,),)`` that results from forwarding ``*args`` unexpanded.
        """
        value = args
        while isinstance(value, (tuple, list)):
            if not value:
                return None
            value = value[0]
        return value

    def getPage(self, url, refer=None):
        """Download ``url`` and hand the decoded HTML to ``parsePage``.

        Args:
            url: Address of the listing page to fetch.
            refer: Optional text written in front of every answer found on
                this page; ``None`` means no prefix.
        """
        # The context manager closes the HTTP response even if read() fails.
        with urlopen(Request(url=url, headers=self.header)) as response:
            raw = response.read()
        self.parsePage(raw.decode(), refer)

    def parsePage(self, htmlres, *args):
        """Extract (question, answer) pairs from ``htmlres`` and save them.

        Args:
            htmlres: HTML text of a listing page.
            *args: Optional answer prefix as the first element.
        """
        matches = self._ITEM_RE.findall(htmlres)
        # Expand args so writePge receives the prefix directly, not a nested
        # tuple.
        self.writePge(matches, *args)

    def writePge(self, p, *args):
        """Append the extracted entries to ``self.outfile``.

        Args:
            p: Iterable of ``(question, answer)`` string pairs.
            *args: Optional answer prefix, either flat or wrapped in a tuple.
        """
        prefix = self._extract_prefix(args)
        with open(self.outfile, 'a', encoding='utf8') as f:
            for question, answer in p:
                answer = answer.strip()
                if prefix is not None:
                    answer = prefix + answer
                # A blank line separates consecutive entries.
                f.write('问题：' + question + '\n' + answer + '\n\n')

    def workon(self, pages=9, delay=2):
        """Crawl the listing pages one after another.

        Args:
            pages: Number of pages to fetch, starting at page 1.  The default
                of 9 matches the original ``range(1, 10)`` loop (the original
                comment mentioned 20 pages, but only 9 were ever fetched).
            delay: Seconds to wait between two consecutive requests.
        """
        for page in range(1, pages + 1):
            if page == 1:
                # The first page lives at the base URL and is the only one
                # whose answers get the '答案:' prefix.
                # NOTE(review): presumably later pages already include that
                # label in their text -- confirm against the live site.
                self.getPage(self.static, refer='答案:')
            else:
                self.getPage(self.static + 'index_%s.html' % page)

            # Throttle requests; no need to wait after the final page.
            if page < pages:
                time.sleep(delay)

if __name__ == '__main__':
    # Script entry point: build the spider and run the crawl with defaults.
    Neihan().workon()