4.1 Task7 实战大项目
实战大项目:模拟登录丁香园,并抓取论坛页面所有的人员基本信息与回复帖子内容。
丁香园论坛:http://www.dxy.cn/bbs/thread/626626#626626 。
1.首先使用Selenium模拟登录丁香园论坛。这里先模拟点击登录按钮,然后模拟点击“返回电脑登录”按钮,接着定位账号输入框和密码输入框,并输入相应的账号及密码。
from selenium import webdriver
import time
def main():
    """Open the DXY forum thread in Chrome and submit the login form.

    Steps: load the thread page, click the login prompt, switch to the
    account/password tab, fill in the credentials and click the submit
    button. The browser window is not closed afterwards, as in the
    original walkthrough; call ``browser.quit()`` when you are done with it.
    """
    # Imported here so the block does not depend on edits to the file's
    # import section. ``find_element(By.…)`` works on Selenium 3 and 4,
    # whereas the ``find_element_by_*`` helpers were removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    # Start the browser through a locally installed chromedriver binary.
    path = r'D:\Google download\chromedriver_win32\chromedriver.exe'
    try:
        # Selenium 4 style: the driver path is wrapped in a Service object.
        from selenium.webdriver.chrome.service import Service
        browser = webdriver.Chrome(service=Service(path))
    except TypeError:
        # Older Selenium releases do not accept ``service=``.
        browser = webdriver.Chrome(executable_path=path)

    url = 'http://www.dxy.cn/bbs/thread/626626#626626'
    browser.get(url)
    time.sleep(1)  # give the page a moment to render the login prompt

    # Click the login button shown in the activation tip.
    browser.find_element(
        By.XPATH, '//*[@class="activate-info-tip-btn"][2]').click()
    # Switch to the "log in on computer" (account/password) tab.
    browser.find_element(By.XPATH, '//*[@class="login__tab j-tab"]').click()
    time.sleep(1)  # wait for the account/password form to appear

    # Fill in the credentials (placeholders: replace with a real account).
    username = browser.find_element(By.NAME, 'username')
    username.send_keys('你的账号')
    password = browser.find_element(By.NAME, 'password')
    password.send_keys('你的密码')

    # Submit the login form.
    browser.find_element(By.XPATH, '//*[@class="button"]').click()


if __name__ == '__main__':
    main()
注意:多次登录后,网站会要求输入验证码才能登录,这个问题这里暂不处理。
2.爬取相关信息
from selenium import webdriver
import time
from bs4 import BeautifulSoup
import requests
def _login(browser, url):
    """Load *url* in *browser* and submit the account/password login form."""
    # Imported here so the block does not depend on edits to the file's
    # import section. ``find_element(By.…)`` works on Selenium 3 and 4,
    # whereas the ``find_element_by_*`` helpers were removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    browser.get(url)
    time.sleep(1)  # give the page a moment to render the login prompt
    # Click the login button shown in the activation tip.
    browser.find_element(
        By.XPATH, '//*[@class="activate-info-tip-btn"][2]').click()
    time.sleep(1)
    # Switch to the "log in on computer" (account/password) tab.
    browser.find_element(By.XPATH, '//*[@class="login__tab j-tab"]').click()
    time.sleep(1)  # wait for the account/password form to appear
    # Fill in the credentials (placeholders: replace with a real account).
    browser.find_element(By.NAME, 'username').send_keys('你的账号')
    browser.find_element(By.NAME, 'password').send_keys('你的密码')
    browser.find_element(By.XPATH, '//*[@class="button"]').click()
    time.sleep(3)  # wait for the login request to finish and set cookies


def _print_posts(html):
    """Print ``author:content`` for every post found in the thread *html*."""
    soup = BeautifulSoup(html, 'lxml')
    for post in soup.find_all("tbody"):
        author = post.find('div', class_="auth")
        body = post.find('td', class_="postbody")
        # Some <tbody> elements are not posts; skip them explicitly instead
        # of hiding every possible error behind a bare ``except``.
        if author is None or body is None:
            continue
        print(author.get_text(strip=True) + ":" + body.get_text(strip=True))


def main():
    """Log in to the DXY forum with Selenium, then scrape the thread.

    After the browser login, the session cookies are copied into a
    ``requests.Session`` so that the page is fetched as the logged-in user;
    a bare ``requests.get`` would share nothing with the browser session.
    The browser is always closed once the cookies have been collected.
    """
    # Start the browser through a locally installed chromedriver binary.
    path = r'D:\Google download\chromedriver_win32\chromedriver.exe'
    try:
        # Selenium 4 style: the driver path is wrapped in a Service object.
        from selenium.webdriver.chrome.service import Service
        browser = webdriver.Chrome(service=Service(path))
    except TypeError:
        # Older Selenium releases do not accept ``service=``.
        browser = webdriver.Chrome(executable_path=path)

    url = 'http://www.dxy.cn/bbs/thread/626626#626626'
    session = requests.Session()
    try:
        _login(browser, url)
        # Carry the authenticated browser cookies over to requests.
        for cookie in browser.get_cookies():
            session.cookies.set(cookie['name'], cookie['value'])
    finally:
        # Release the browser and chromedriver process even if login fails.
        browser.quit()

    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
    }
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = session.get(url, headers=headers, timeout=10)
    _print_posts(response.text)


if __name__ == '__main__':
    main()