# BeautifulSoup爬取京东畅销书排行榜并写入Mongodb数据库
# (Scrape JD's bestselling-books ranking with BeautifulSoup and store it in MongoDB.)
import requests
from bs4 import BeautifulSoup
import pymongo as pm
import os
import json
# Desktop Chrome User-Agent so JD serves the normal desktop ranking page
# instead of blocking the default python-requests agent.
headers = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/71.0.3578.98 Safari/537.36'),
}
# The ranking spans 5 pages on the site.
def get_urls(pages=5):
    """Build the list-page URLs of JD's bestselling-books ranking.

    Args:
        pages: number of ranking pages to generate (the site shows 5;
               kept as a default so existing callers are unaffected).

    Returns:
        list[str]: one URL per page, with a 1-based page index embedded
        in the query string.
    """
    base = 'http://book.jd.com/booktop/0-0-0.html?category=1713-0-0-0-10001-{}#comfort'
    return [base.format(page) for page in range(1, pages + 1)]
# Global serial number ('序号') continuing across pages.
cnt = 1


# 存入Mongodb数据库 (store into the MongoDB database)
def save_Mongodb(url, headers, collection):
    """Fetch one ranking page, parse each book entry, and insert it into MongoDB.

    Args:
        url: ranking list-page URL (one of ``get_urls()``).
        headers: HTTP headers passed to ``requests.get`` (supplies the User-Agent).
        collection: PyMongo collection that receives one document per book.

    Side effects:
        Prints title/link/author/publisher of each book and inserts a document
        ``{'序号', '书名', '链接', '作者', '出版社'}`` per entry.
    """
    # Bug fix: the original read the module-level ``cnt`` but never incremented
    # it, so every document was stored with 序号 == 1.
    global cnt
    # ``resp`` instead of ``re``: the original name shadowed the stdlib ``re`` module.
    resp = requests.get(url, headers=headers)
    resp.encoding = 'GBK'  # NOTE(review): assumes the page is GBK-encoded — confirm
    soup = BeautifulSoup(resp.text, 'html.parser')
    # Each book entry lives in a <div class="p-detail"> element.
    for item in soup.find_all('div', attrs={'class': 'p-detail'}):
        title = item.a['title']
        print(title)
        link = item.a['href']
        print(link)
        # The two <dd> children hold the author and the publisher links.
        details = item.find_all('dd')
        author = details[0].a['title']
        print(author)
        pub = details[1].a['title']
        print(pub)
        # insert_one replaces Collection.insert, which is deprecated and was
        # removed in PyMongo 4.
        collection.insert_one({'序号': cnt, '书名': title, '链接': link,
                               '作者': author, '出版社': pub})
        cnt += 1
def main():
    """Connect to the local MongoDB and scrape every ranking page into MYDB.rate_books."""
    myclient = pm.MongoClient('localhost', 27017)
    mycollection = myclient.MYDB.rate_books
    for url in get_urls():
        save_Mongodb(url, headers, mycollection)


if __name__ == '__main__':
    # Guard the entry point so importing this module does not trigger
    # network requests or database writes as a side effect.
    main()