import re
import time
import pymongo
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from pyquery import PyQuery as pq
# MongoDB host, plus the database and collection that scraped posts are written to.
MONGO_URL = 'localhost'
MONGO_DB = 'zhihu'
MONGO_COLLECTION = 'mujj_posts'
# Highest index-page number to crawl; main() walks pages 1..MAX_PAGE inclusive.
MAX_PAGE = 1

# Run Chrome without a visible window. Drop the '--headless' argument to watch
# the browser while debugging.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
# `options=` is the supported keyword; the older `chrome_options=` spelling was
# deprecated and later removed from Selenium.
browser = webdriver.Chrome(options=chrome_options)
# Shared explicit wait: every wait.until() call gives up after 10 seconds.
wait = WebDriverWait(browser, 10)

client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]
def index_page(page, retries=3):
    """
    Load one page of the author's post list and scrape the posts on it.

    The page is considered ready once the highlighted pagination button shows
    ``page`` and at least one post item is present. If either wait times out
    the page is reloaded, up to ``retries`` additional times.

    :param page: page number to load
    :param retries: how many times to reload the page after a timeout
    :return: None
    """
    url = 'https://www.zhihu.com/people/WanPlusSW/posts?page=' + str(page)
    for _ in range(retries + 1):
        print('正在爬取第', page, '页')
        try:
            browser.get(url)
            # The condition needs the expected text as its second argument.
            wait.until(
                EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR, '.Pagination .PaginationButton--current'),
                    str(page),
                )
            )
            # Wait for the same elements get_posts() parses, so the page source
            # is only read once they exist.
            wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '#Profile-posts .List-item')
                )
            )
            browser.execute_script("window.scrollTo(0, document.body.scrollHeight)")
            # Presumably gives content triggered by the scroll time to render.
            time.sleep(3)
        except TimeoutException:
            # Retry the same page; the loop bound keeps a permanently failing
            # page from retrying forever.
            continue
        get_posts()
        return
    print('第', page, '页多次超时，已跳过')
# Compiled once at import time instead of once per list item.
_ITEM_ID_RE = re.compile(r'"itemId":(.*?),')
_FIRST_NUMBER_RE = re.compile(r'(\d+)')


def _first_int(text):
    """Return the first run of digits in ``text`` as an int, or 0 if there is none."""
    match = _FIRST_NUMBER_RE.search(text or '')
    return int(match.group(1)) if match else 0


def get_posts():
    """
    Extract every post on the currently loaded page and store each one.

    For each ``#Profile-posts .List-item`` element a dict with the keys
    ``title``, ``url``, ``authorname``, ``authorurl``, ``like`` and
    ``commentnum`` is built, printed and passed to ``save_to_mongo``.
    Items whose ``data-zop`` attribute is missing or contains no ``itemId``
    are skipped, because no article URL can be built for them.

    :return: None
    """
    doc = pq(browser.page_source)
    for item in doc('#Profile-posts .List-item').items():
        zop = item.find('.ContentItem.ArticleItem').attr('data-zop')
        id_match = _ITEM_ID_RE.search(zop) if zop else None
        if id_match is None:
            continue

        author_link = item.find('.ContentItem-meta .AuthorInfo-content .UserLink-link')
        like_text = item.find('.ContentItem-actions .LikeButton').text()
        comment_text = item.find(
            '.ContentItem-actions .ContentItem-action.Button--plain'
        ).text()

        mujj_posts = {
            'title': item.find('.ContentItem-title').text(),
            'url': 'https://zhuanlan.zhihu.com/p/' + id_match.group(1),
            'authorname': author_link.text(),
            'authorurl': 'https:' + str(author_link.attr('href')),
            # Both counters take the first number in the button label and fall
            # back to 0, so a label with no digits no longer raises ValueError.
            'like': _first_int(like_text),
            'commentnum': _first_int(comment_text),
        }
        print(mujj_posts)
        save_to_mongo(mujj_posts)
def save_to_mongo(result):
    """
    Insert one scraped post into the ``MONGO_COLLECTION`` collection.

    Storage is best-effort: a failure is reported on stdout together with the
    exception and then swallowed, so one bad write does not stop the crawl.

    :param result: dict describing a single post
    :return: None
    """
    try:
        # insert_one() replaces Collection.insert(), which newer PyMongo
        # releases no longer provide.
        db[MONGO_COLLECTION].insert_one(result)
    except Exception as exc:
        print('存储到MongoDB失败', exc)
    else:
        print('存储到MongoDB成功')
def main():
    """
    Scrape index pages 1 through ``MAX_PAGE``, then shut the browser down.

    :return: None
    """
    try:
        for i in range(1, MAX_PAGE + 1):
            index_page(i)
    finally:
        # quit() ends the whole WebDriver session, whereas close() only closes
        # the current window. Running it in `finally` releases the browser
        # even when scraping raises.
        browser.quit()


if __name__ == '__main__':
    main()
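# 使用python3+selenium爬取知乎文章 (Scraping Zhihu articles with Python 3 + Selenium)
# 最新推荐文章于 2024-06-15 08:32:50 发布 (blog-page footer: "latest recommended article published 2024-06-15 08:32:50")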