urllib mongo 爬取内涵段子 BeautifulSoup

代码表现时刻

import urllib.request
from bs4 import BeautifulSoup
from sprider.mongo_db.db_neihan_content import DBNeiHan


class NH(object):
    """Scrape one listing page of neihan8.com and store its entries via DBNeiHan."""

    def __init__(self):
        # Page number interpolated into the listing URL by loadPage().
        self.page = 1

    def get_data(self, li_list):
        """Yield one ``{"title": ..., "content": ...}`` dict per usable entry.

        Args:
            li_list: Iterable of parsed ``<li>`` nodes (objects exposing
                ``find_all``), as produced by BeautifulSoup in loadPage().

        Yields:
            dict: ``"title"`` is the text of the first ``<a>`` in the entry;
            ``"content"`` is the concatenated text of the first
            ``<div class="f18 mb20">`` with outer whitespace stripped and all
            space characters removed.

        Entries that lack either the anchor or the content div are skipped
        instead of aborting the whole page with an IndexError.
        """
        for q in li_list:
            anchors = q.find_all("a")
            bodies = q.find_all("div", class_="f18 mb20")
            if not anchors or not bodies:
                # Malformed / advertisement entry: nothing to extract.
                continue

            anchor = anchors[0]
            title = anchor.string
            if title is None:
                # ``.string`` is None when the anchor wraps nested markup;
                # fall back to the concatenation of all of its text nodes.
                title = "".join(anchor.strings).strip()
            # Store a plain str so the document does not hold a parse-tree node.
            title = str(title)
            print(title)

            content_string = "".join(bodies[0].strings).strip().replace(" ", "")
            print(content_string)
            print("==============")

            yield {"title": title, "content": content_string}

    def loadPage(self):
        """Fetch listing page ``self.page``, parse it and insert the entries.

        Prints the requested URL and the number of matching listing ``<ul>``
        elements. Returns None. When the listing element is missing or no
        entry could be extracted, nothing is inserted.

        Raises:
            urllib.error.URLError: propagated from ``urlopen`` on network or
                HTTP failures.
        """
        url = "https://www.neihan8.com/article/list_5_" + str(self.page) + ".html"
        print(url)
        header = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"}

        request = urllib.request.Request(url, headers=header)

        # Context manager guarantees the HTTP response is closed.
        with urllib.request.urlopen(request) as response:
            raw = response.read()

        # gb18030 is a superset of gb2312/gbk, so pages containing characters
        # outside strict GB2312 no longer raise UnicodeDecodeError; undecodable
        # bytes are replaced rather than aborting the scrape.
        text = raw.decode("gb18030", errors="replace")
        soup = BeautifulSoup(text, "html.parser")
        aa = soup.find_all("ul", class_="piclist longList")
        print(len(aa))
        if not aa:
            # Layout changed or an error page was served: nothing to parse.
            return

        documents = list(self.get_data(aa[0].find_all("li")))
        if not documents:
            # NOTE(review): presumably DBNeiHan.insert_many forwards to
            # pymongo, which rejects an empty document list - confirm.
            return

        mg = DBNeiHan()
        mg.insert_many(documents)


if __name__ == '__main__':
    # Script entry point: scrape the default listing page and store it.
    NH().loadPage()
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值