# 实战大项目:模拟登录丁香园,并抓取论坛页面所有的人员基本信息与回复帖子内容。
# (Practical project: simulate login to DXY and scrape member info and reply contents from the forum thread.)
# 丁香园论坛:http://www.dxy.cn/bbs/thread/626626#626626
from selenium import webdriver
from time import sleep
import random
from bs4 import BeautifulSoup
import json
def open():
    """Launch a Firefox WebDriver and return it.

    NOTE(review): this function shadows the builtin ``open``; consider
    renaming it (and its call site in ``main``) in a later change.
    """
    # browser = webdriver.Chrome()  # swap in to use Chrome instead
    return webdriver.Firefox()
def web_login(browser, url, username='username', password='password'):
    """Log in to DXY through the login page and return the session cookies.

    Parameters
    ----------
    browser : selenium WebDriver
        An already-opened browser instance.
    url : str
        The login page URL.
    username, password : str
        Account credentials. Previously hard-coded as the literal
        placeholder strings 'username'/'password'; now parameters with
        the same defaults so existing callers are unaffected.

    Returns
    -------
    dict
        Mapping of cookie name -> cookie value after login.
    """
    browser.get(url)
    sleep(random.random() * 3)  # random pause to look less bot-like
    # Switch from the mobile page to the desktop login form.
    login_link = browser.find_element_by_link_text('返回电脑登录')
    login_link.click()
    sleep(random.random() * 3)
    user_input = browser.find_element_by_name('username')
    user_input.clear()
    user_input.send_keys(username)
    sleep(random.random() * 3)
    pwd_input = browser.find_element_by_name('password')
    pwd_input.clear()  # clear prefilled text, mirroring the username field
    pwd_input.send_keys(password)
    sleep(random.random() * 3)
    browser.find_element_by_class_name('button').click()
    sleep(10)  # wait for the post-login redirect to finish
    # NOTE(review): find_element_by_* was removed in Selenium 4; this code
    # assumes Selenium 3 — confirm the pinned selenium version.
    return {c['name']: c['value'] for c in browser.get_cookies()}
def get_web_detail(browser, url):
    """Load the thread page and extract its title, replies and authors.

    Parameters
    ----------
    browser : selenium WebDriver
        A browser that has already performed login (carries cookies).
    url : str
        Thread page URL.

    Returns
    -------
    dict with keys:
        'title'   : page <title> text, or '' when the page has no title
        'replies' : list of '.postbody' elements (BeautifulSoup Tags)
        'authors' : list of '.auth' elements (BeautifulSoup Tags)
    """
    browser.get(url)
    sleep(random.random() * 8)  # random pause so the page can finish loading
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    title_tag = soup.title
    return {
        # Guard against pages without a <title>; the original dereferenced
        # soup.title.string unconditionally and raised AttributeError.
        'title': title_tag.string if title_tag is not None else '',
        'replies': soup.select('.postbody'),
        'authors': soup.select('.auth'),
    }
def show_data(data):
    """Print the thread title and each (author, reply) pair.

    Parameters
    ----------
    data : dict
        As produced by ``get_web_detail``: 'title' is a string, while
        each entry of 'replies' / 'authors' only needs a ``.text``
        attribute.

    Pairs are matched with ``zip``, so a length mismatch between authors
    and replies no longer raises IndexError (the original indexed both
    lists by ``range(len(replies))``).
    """
    print(f'Title: {data["title"]}')
    replies = [reply.text.strip() for reply in data['replies']]
    authors = [author.text for author in data['authors']]
    for author, reply in zip(authors, replies):
        print(f'作者: {author} \n 回复内容: {reply}')
def main(login_url='https://auth.dxy.cn/accounts/login',
         url='http://www.dxy.cn/bbs/thread/626626#626626'):
    """Drive the full flow: open a browser, log in, scrape, display.

    Parameters
    ----------
    login_url : str
        Login page URL (defaulted so the function no longer depends on
        module-level globals; ``main()`` still works unchanged).
    url : str
        Thread page URL to scrape.

    The browser is always closed, even if login or scraping raises —
    the original leaked the browser process on any failure.
    """
    browser = open()
    try:
        web_login(browser, login_url)
        data = get_web_detail(browser, url)
        show_data(data)
    finally:
        browser.quit()
if __name__ == '__main__':
    login_url = 'https://auth.dxy.cn/accounts/login'
    # Fixed: the original URL contained a stray space in the fragment
    # ('626626# 626626'), which does not match the documented thread URL.
    url = 'http://www.dxy.cn/bbs/thread/626626#626626'
    main()