# Code showcase (original heading: 代码表现时刻)
import urllib.request
from bs4 import BeautifulSoup
from sprider.mongo_db.db_neihan_content import DBNeiHan
class NH(object):
    """Scraper for a single article listing page of neihan8.com.

    ``loadPage`` downloads the listing page selected by ``self.page``,
    ``get_data`` extracts the title and text of each entry, and the
    resulting documents are stored through ``DBNeiHan.insert_many``.
    """

    # URL pattern of a listing page; ``{page}`` is replaced by ``self.page``.
    _LIST_URL = "https://www.neihan8.com/article/list_5_{page}.html"

    # Browser User-Agent string sent with the request.
    _USER_AGENT = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) "
        "AppleWebKit/535.11 (KHTML, like Gecko) "
        "Chrome/17.0.963.56 Safari/535.11"
    )

    # Seconds to wait on the network before giving up instead of hanging.
    _REQUEST_TIMEOUT = 30

    def __init__(self):
        """Create a scraper positioned on listing page 1."""
        self.page = 1

    def get_data(self, li_list):
        """Yield one document per usable entry of a listing.

        Args:
            li_list: Iterable of parsed ``<li>`` elements; each must offer a
                BeautifulSoup-style ``find_all`` method.

        Yields:
            dict: ``{"title": ..., "content": ...}`` where ``title`` is the
            ``string`` of the entry's first ``<a>`` and ``content`` is the
            text of its first ``div.f18.mb20`` with surrounding whitespace
            stripped and every space character removed.

        Entries lacking either the link or the content block are skipped
        rather than aborting the whole page with an ``IndexError``.
        """
        for item in li_list:
            links = item.find_all("a")
            bodies = item.find_all("div", class_="f18 mb20")
            if not links or not bodies:
                # Malformed entry: nothing meaningful to store.
                continue

            title = links[0].string
            print(title)

            content = "".join(bodies[0].strings).strip().replace(" ", "")
            print(content)
            print("==============")

            yield {"title": title, "content": content}

    def loadPage(self):
        """Download listing page ``self.page`` and store its entries.

        Prints the requested URL and the number of matching ``<ul>`` lists.
        Returns without touching the database when the page contains no
        ``ul.piclist.longList`` element or no usable entries.

        Raises:
            urllib.error.URLError: Propagated from ``urlopen`` when the
                request fails or times out.
        """
        url = self._LIST_URL.format(page=self.page)
        print(url)

        request = urllib.request.Request(
            url, headers={"User-Agent": self._USER_AGENT}
        )
        # The context manager closes the connection even if reading fails.
        with urllib.request.urlopen(
            request, timeout=self._REQUEST_TIMEOUT
        ) as response:
            raw = response.read()

        # Strict "gb2312" decoding raises on any character outside that
        # repertoire. GB18030 covers the wider GBK/GB18030 range, and
        # errors="replace" keeps a stray bad byte from discarding the page.
        text = raw.decode("gb18030", errors="replace")

        soup = BeautifulSoup(text, "html.parser")
        lists = soup.find_all("ul", class_="piclist longList")
        print(len(lists))
        if not lists:
            # Layout changed or the page is empty: nothing to parse.
            return

        documents = list(self.get_data(lists[0].find_all("li")))
        if not documents:
            # NOTE(review): presumably DBNeiHan.insert_many wraps a MongoDB
            # bulk insert, which rejects an empty batch - confirm. Either
            # way there is nothing to store.
            return

        DBNeiHan().insert_many(documents)
if __name__ == "__main__":
    # Script entry point: scrape the first listing page and store its entries.
    scraper = NH()
    scraper.loadPage()