构建一个模块,调用它即可实现数据的查看、插入、更新和删除。
mongomodel 模块(data/mongomodel.py)
from pymongo import MongoClient
class MongoMdel(object):
    """Small convenience wrapper around one MongoDB collection.

    On construction it opens a ``MongoClient`` to ``db_ip:db_port`` and
    binds to the collection ``table_name`` inside database ``db_name``.
    The other methods delegate to that collection (``self.table``).
    """

    def __init__(self, db_ip, db_port, db_name, table_name):
        """Store the connection settings and open the client.

        Args:
            db_ip: Host passed to ``MongoClient(host=...)``.
            db_port: Port passed to ``MongoClient(port=...)``.
            db_name: Name of the database to select.
            table_name: Name of the collection to select.
        """
        self.db_ip = db_ip
        self.db_port = db_port
        self.db_name = db_name
        self.table_name = table_name
        self.connect = MongoClient(host=self.db_ip, port=self.db_port)
        # Select the database.
        self.db = self.connect[self.db_name]
        # Select the collection.
        self.table = self.db[self.table_name]

    def add(self, kv_dict):
        """Insert data into the collection.

        Args:
            kv_dict: A single document (mapping) or a list of documents.

        Returns:
            The inserted ``_id`` for a single document, or the list of
            inserted ``_id`` values when a list was given.
        """
        # Collection.insert() is deprecated in PyMongo 3 and removed in
        # PyMongo 4; insert_one/insert_many are the supported equivalents.
        # Unwrapping the result keeps the id-based return value of insert().
        if isinstance(kv_dict, list):
            return self.table.insert_many(kv_dict).inserted_ids
        return self.table.insert_one(kv_dict).inserted_id

    def get_one(self, query):
        """Return the first document matching ``query``, or ``None``."""
        return self.table.find_one(query)

    def get_all(self, query):
        """Return the result of ``find(query)`` for all matching documents."""
        return self.table.find(query)

    def delete(self, query):
        """Delete every document matching ``query``; return the driver result."""
        return self.table.delete_many(query)

    def check(self, query):
        """Return ``True`` if at least one document matches ``query``."""
        # Identity comparison: a found document may define its own __eq__.
        return self.table.find_one(query) is not None

    def update(self, query, kv_dict):
        """Apply ``{'$set': kv_dict}`` to the first document matching ``query``.

        Returns:
            The result of ``update_one`` (previously nothing was returned;
            callers that ignored the return value are unaffected).
        """
        return self.table.update_one(query, {'$set': kv_dict})
主程序(爬虫入口脚本)
import requests
from lxml import etree
from data.mongomodel import MongoMdel
# Fetch a page
def spider(url, timeout=10):
    """Download ``url`` and return it parsed by ``etree.HTML``.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds passed to ``requests.get``. Without a timeout a
            stalled server would block the crawler indefinitely.

    Returns:
        Whatever ``etree.HTML`` returns for the response text.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0',
    }
    res = requests.get(url, headers=headers, timeout=timeout)
    return etree.HTML(res.text)
# Fetch a post's title
def title(href):
    """Return the headline text of the post page at ``href``.

    The page is fetched with ``spider`` and the first text node under
    ``div.bbs-hd-h1 > h1`` is returned.

    Args:
        href: URL of the post's detail page.

    Returns:
        The title string, or ``'unknow'`` when the page has no such node.
    """
    html = spider(href)
    try:
        return html.xpath('//div[@class="bbs-hd-h1"]/h1/text()')[0]
    except (IndexError, AttributeError):
        # IndexError: the XPath matched nothing.
        # AttributeError: the parser produced no document (html is None).
        # A bare ``except`` would also swallow KeyboardInterrupt/SystemExit.
        return 'unknow'
# Parse the listing page
def parse(html):
    """Extract one record per post from a parsed forum listing page.

    Args:
        html: Parsed page object exposing ``xpath(expr)``.

    Returns:
        A ``zip`` iterator of tuples
        ``(title, href, author, replies, browses, last_time, last_author)``.
        As with any ``zip``, it stops at the shortest of the seven lists.
    """
    # Detail-page links, made absolute.
    parse_hrefs = html.xpath('//ul[@class="for-list"]//div[@class="titlelink box"]/a[@class="truetit"]/@href')
    parse_hrefs = ['http://bbs.hupu.com' + href for href in parse_hrefs]
    # One extra request per post: the title is read from the detail page.
    titles = [title(href) for href in parse_hrefs]
    print(len(parse_hrefs), len(titles), titles)
    # Authors
    authors = html.xpath('//div[@class="author box"]/a[@class="aulink"]/text()')
    # Each cell is split on '\xa0/\xa0' into a replies part and a views part.
    datas = html.xpath('//ul[@class="for-list"]/li/span[@class="ansour box"]/text()')
    replies = []
    browses = []
    for cell in datas:
        parts = cell.split('\xa0/\xa0')
        # Reply count
        replies.append(parts[0])
        # View count; a cell without the separator used to raise IndexError.
        browses.append(parts[1] if len(parts) > 1 else '')
    last_times = html.xpath('//div[@class="endreply box"]/a/text()')
    last_author = html.xpath('//div[@class="endreply box"]/span[@class="endauthor "]/text()')
    print(last_times)
    print(last_author)
    # zip bundles the parallel lists into per-post tuples.
    return zip(titles, parse_hrefs, authors, replies, browses, last_times, last_author)
def data_storage(items, db_ip='192.168.20.***', db_port=27017,
                 db_name='hupu', table_name='post'):
    """Store every scraped record as one MongoDB document.

    Args:
        items: Iterable of 7-element sequences ordered as
            (title, href, author, replies, browses, last_time, last_author).
        db_ip: MongoDB host. The default is a placeholder and must be
            replaced with the address of your own server.
        db_port: MongoDB port (default 27017).
        db_name: Database name (default ``hupu``).
        table_name: Collection name (default ``post``).
    """
    hupu_post = MongoMdel(db_ip, db_port, db_name, table_name)
    # Document keys, in the same order as the positions inside each item.
    fields = (
        "titles",
        "parse_hrefs",
        "authors",
        "replies",
        "browses",
        "last_times",
        "last_author",
    )
    for item in items:
        # Explicit indexing: a short item fails loudly rather than being
        # stored as a partial document.
        hupu_post.add({name: item[pos] for pos, name in enumerate(fields)})
def main():
    """Crawl the listing page, parse it and store the parsed records."""
    # Pipeline: fetch -> parse -> store.
    listing = spider('https://bbs.hupu.com/nba-10')
    records = parse(listing)
    data_storage(records)


# Run the crawler only when this file is executed as a script.
if __name__ == '__main__':
    main()