Ping An Internship: Scraping Data from the Huanbao (Environmental Protection) Tieba Forum

# coding: utf-8
# (Under Python 2 this encoding declaration must come before any other code,
# otherwise an error is raised; Python 3 source files already default to UTF-8.)

"""
优化目标:
1、封装代码;
2、优化部分代码。
"""

# 0. Import the required packages
from selenium import webdriver
import time
import requests
from lxml import etree


# 1. Open the target page, fill in the login credentials, and log in automatically
def LogIn():
    driver_path = r"C:\Users\AppData\Local" \
                  r"\Google\Chrome\Application\chromedriver.exe"
    # Selenium 3 style driver setup; see the Selenium 4 sketch below
    browser = webdriver.Chrome(driver_path)
    login_url = "https://passport.baidu.com/v2/?login&tpl=mn&u=http%3A%2F%2Fwww.baidu.com%2F&sms=5"
    browser.get(login_url)
    time.sleep(1)
    # Switch from the default panel to the username/password login form
    login_button = browser.find_element_by_id("TANGRAM__PSP_3__footerULoginBtn")
    login_button.click()
    input_telephone = browser.find_element_by_id("TANGRAM__PSP_3__userName")
    phone = input("Enter your login phone number: ")
    input_telephone.send_keys(phone)
    time.sleep(1)
    input_password = browser.find_element_by_id("TANGRAM__PSP_3__password")
    password = input("Enter your password: ")
    input_password.send_keys(password)
    time.sleep(1)
    login_submit = browser.find_element_by_id("TANGRAM__PSP_3__submit")
    login_submit.click()
    print("**** Logged in successfully ****")

# 2. Parse the pages
def ParsePage():
    start_page = (int(input("Enter the first page number to scrape: ")) - 1) * 50
    end_page = (int(input("Enter the last page number to scrape: ")) - 1) * 50 + 1
    # Tieba paginates in steps of 50 posts, so page n starts at pn=(n-1)*50
    page_numbers = [x for x in range(start_page, end_page, 50)]
    for page_number in page_numbers:
        try:
            target_url = "https://tieba.baidu.com/f?kw=%E7%8E%AF%E4%BF%9D&ie=utf-8&pn=" + str(page_number)
            # note: Tieba may reject requests without a browser User-Agent;
            # pass headers={"User-Agent": ...} here if the responses look empty
            response = requests.get(target_url)
            text = response.text
            # 4. Parse the returned page
            html = etree.HTML(text)
            titles = html.xpath("//*[@id=\"thread_list\"]/li/div/div[2]/div[1]/div[1]/a/text()")
            contents_urls = html.xpath("//*[@id=\"thread_list\"]/li/div/div[2]/div[1]/div[1]/a/@href")

            # 4.1 Keep only the relative post links (those starting with /p)
            new_contents_urls = []
            for new_contents_url in contents_urls:
                if new_contents_url.startswith("/p"):
                    new_contents_urls.append(new_contents_url)

            # 4.2 Fetch each post to get the full author name and the post body
            authors = []
            contents = []
            for url in new_contents_urls:
                try:
                    target_url = "https://tieba.baidu.com" + url  # url already starts with "/p"
                    response = requests.get(target_url)
                    text = response.text
                    html = etree.HTML(text)
                    author = html.xpath("//*[@id=\"j_p_postlist\"]/div[1]/div[1]/ul/li[3]/a/text()")
                    authors.append(author[0])
                    content = html.xpath("//cc//text()")
                    contents.append(content)
                    time.sleep(1)
                except Exception:
                    # caveat: skipping a post here leaves authors/contents shorter
                    # than titles, so the remaining rows can fall out of alignment
                    print("This post no longer exists!")
                    continue

            # 4.3 Clean the post bodies: strip whitespace and drop empty fragments
            new_contents_1 = [[x.strip() for x in content if x.strip()] for content in contents]

            # 5. Save the cleaned data to a CSV file, one file per page
            page = page_number // 50 + 1
            filename = "Huanbao_Tieba_page_%s.csv" % page
            with open(filename, "w", encoding="utf-8") as fp:
                fp.write("author,")
                fp.write("title,")
                fp.write("content")
                fp.write("\n")
                for i in range(len(authors)):
                    fp.write(authors[i])
                    fp.write(",")
                    fp.write(titles[i])
                    fp.write(",")
                    fp.write(str(new_contents_1[i]))
                    fp.write("\n")
            print("Page %s parsed successfully!" % page)
        except Exception:
            page = page_number // 50 + 1
            print("Failed to parse page %s!" % page)
            continue
    print("All pages parsed!")

# 3. Save the data (placeholder; ParsePage currently does the saving itself)
def MessageStore():
    print("Data saved successfully!")
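
# --- MessageStore above is still an empty placeholder. One plausible way to fill
# --- it in is to merge the per-page CSV files that ParsePage writes into a single
# --- file; this implementation and the merged filename are assumptions, not the
# --- original design.
def MergeCsvPages(merged_name="Huanbao_Tieba_all_pages.csv"):
    import csv
    import glob
    with open(merged_name, "w", encoding="utf-8-sig", newline="") as out:
        writer = csv.writer(out)
        writer.writerow(["author", "title", "content"])
        for path in sorted(glob.glob("Huanbao_Tieba_page_*.csv")):
            with open(path, "r", encoding="utf-8-sig", newline="") as fp:
                reader = csv.reader(fp)
                next(reader, None)  # skip each per-page header row
                writer.writerows(reader)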


if __name__ == '__main__':
    LogIn()
    ParsePage()




