#coding:utf-8
# (必须加上第一句编码声明,不然会报错)
"""
优化目标:
1、封装代码;
2、优化部分代码。
"""
# 0、导入所需要的包
import csv
import time

import requests
from lxml import etree
from selenium import webdriver
# 1、打开目标网页,传入登录信息,实现自动登录
def LogIn():
    """Drive an interactive Baidu login through a local Chrome instance.

    The phone number and password are read from the console and typed into
    the passport form via Selenium.  The browser handle is local to this
    function and is not returned, so the window lives for the driver's
    own lifetime.
    """
    chromedriver = (
        r"C:\Users\AppData\Local"
        r"\Google\Chrome\Application\chromedriver.exe"
    )
    chrome = webdriver.Chrome(chromedriver)
    chrome.get("https://passport.baidu.com/v2/?login&tpl=mn&u=http%3A%2F%2Fwww.baidu.com%2F&sms=5")
    time.sleep(1)
    # Switch the form from SMS mode to account/password mode.
    chrome.find_element_by_id("TANGRAM__PSP_3__footerULoginBtn").click()
    # Locate the username field, then prompt for and type the phone number.
    phone_box = chrome.find_element_by_id("TANGRAM__PSP_3__userName")
    phone_box.send_keys(input("请输入您的登录号码:"))
    time.sleep(1)
    # Locate the password field, then prompt for and type the password.
    password_box = chrome.find_element_by_id("TANGRAM__PSP_3__password")
    password_box.send_keys(input("请输入您的密码:"))
    time.sleep(1)
    # Submit the login form.
    chrome.find_element_by_id("TANGRAM__PSP_3__submit").click()
    print("****登录成功*****")
# 2、解析页面
def ParsePage():
    """Crawl a user-chosen range of listing pages of the Tieba "环保" forum.

    Prompts on the console for a start and an end page number, scrapes
    every thread listed on those pages (author, title, body text), and
    writes one CSV file per listing page.

    Returns:
        None.  Progress and failures are reported on stdout; a failure on
        one page does not stop the remaining pages (best-effort loop, as
        in the original).
    """
    start_offset = (int(input("请输入您想要爬取的开始网页编号:")) - 1) * 50
    end_offset = (int(input("请输入您想要爬取的结束网页编号:")) - 1) * 50 + 1
    for offset in range(start_offset, end_offset, 50):
        # Fix: use integer division — the original `/ 50 + 1` produced a
        # float, yielding filenames like "环保吧第1.0页内容.csv".
        page = offset // 50 + 1
        try:
            _crawl_listing_page(offset, page)
            print("第%s页解析成功!" % page)
        except Exception as exc:  # narrowed from bare `except:`; report, keep going
            print("第%s页解析失败!" % page)
            print(exc)
    print("全部页面解析完成!")


def _crawl_listing_page(offset, page):
    """Scrape one listing page (pn=offset) and write its CSV (helper)."""
    listing_url = (
        "https://tieba.baidu.com/f?kw=%E7%8E%AF%E4%BF%9D&ie=utf-8&pn="
        + str(offset)
    )
    # Fix: add a timeout so a stalled connection cannot hang the crawl.
    html = etree.HTML(requests.get(listing_url, timeout=10).text)
    titles = html.xpath("//*[@id=\"thread_list\"]/li/div/div[2]/div[1]/div[1]/a/text()")
    hrefs = html.xpath("//*[@id=\"thread_list\"]/li/div/div[2]/div[1]/div[1]/a/@href")
    # Keep only real thread links ("/p/..."); ads and other anchors are dropped.
    thread_paths = [h for h in hrefs if h.startswith("/p")]

    authors = []
    contents = []
    for path in thread_paths:
        try:
            # Fix: no trailing slash — the original produced ".com//p/...".
            thread_url = "https://tieba.baidu.com" + path
            thread_html = etree.HTML(requests.get(thread_url, timeout=10).text)
            author = thread_html.xpath(
                "//*[@id=\"j_p_postlist\"]/div[1]/div[1]/ul/li[3]/a/text()")
            authors.append(author[0])  # IndexError here means a dead thread
            contents.append(thread_html.xpath("//cc//text()"))
            time.sleep(1)  # be polite to the server
        except Exception:  # narrowed from bare `except:`; skip dead threads
            print("该帖子不存在!")

    # Strip whitespace and drop empty fragments from every post body.
    cleaned = [[piece.strip() for piece in body if piece.strip()]
               for body in contents]
    _write_page_csv(page, authors, titles, cleaned)


def _write_page_csv(page, authors, titles, contents):
    """Write one page's rows to '环保吧第N页内容.csv' in UTF-8 (helper).

    Uses the csv module so commas inside titles/contents are quoted
    correctly instead of silently corrupting the column layout.
    """
    filename = "环保吧第%s页内容.csv" % page
    # newline="" is required so csv handles its own line endings.
    with open(filename, "w", encoding="utf-8", newline="") as fp:
        writer = csv.writer(fp)
        writer.writerow(["发帖人", "发帖标题", "发帖内容"])
        # NOTE(review): rows are paired by position; if a thread was skipped
        # above, titles can drift out of step with authors.  zip truncates to
        # the shortest list instead of raising IndexError as before.
        for author, title, body in zip(authors, titles, contents):
            writer.writerow([author, title, str(body)])
# 3、保存数据
def MessageStore():
    """Stub for a dedicated save step; only reports success on stdout.

    NOTE(review): never invoked from the __main__ block — presumably a
    placeholder for future persistence logic.
    """
    print("数据保存成功!")
# Entry point: interactive login first, then crawl the forum pages.
# NOTE(review): MessageStore() is defined above but never called here.
if __name__ == '__main__':
    LogIn()
    ParsePage()
# 平安实习—环保吧数据爬取
# (最新推荐文章于 2024-10-01 05:04:32 发布)