Python 爬虫:用 Selenium 抓取 QQ 空间内容

#coding:utf-8

import time
from selenium import webdriver
from lxml import etree

import sys
reload(sys)
sys.setdefaultencoding( "utf-8" )

# --- Configuration: fill these in before running ---------------------------
friend = ''  # Target QQ number; that user's Qzone must be open to visitors
user = ''  # Your QQ number
pw = ''  # Your QQ password

# Start Chrome through the chromedriver binary at this local path.
driver = webdriver.Chrome(executable_path='/Users/jiwu/Downloads/chromedriver')
driver.maximize_window()

# The login widgets are looked up inside the "login_frame" iframe of the portal.
driver.get("http://i.qq.com")
driver.switch_to.frame("login_frame")

# NOTE(review): "switcher_plogin" presumably toggles the form from the default
# (QR-code) view to account/password entry -- confirm against the live page.
driver.find_element_by_id("switcher_plogin").click()
driver.find_element_by_id("u").send_keys(user)
driver.find_element_by_id("p").send_keys(pw)
driver.find_element_by_id("login_button").click()

# click() returns as soon as the event is dispatched; the login itself is
# asynchronous. Navigating away immediately can abort it and leave the session
# unauthenticated, so give it a moment to complete first.
time.sleep(3)

# Leave the login iframe before loading another page.
driver.switch_to.default_content()

# NOTE(review): the "/311" suffix presumably selects the friend's post feed
# (the page hosting "app_canvas_frame") -- confirm.
driver.get("http://user.qzone.qq.com/" + friend + "/311")

# Walk the feed page by page, appending each post's text to qq_word.txt.
# The "next page" link id carries a counter ("pager_next_0", "pager_next_1",
# ...), which next_num tracks.
next_num = 0
try:
    while True:
        # Scroll down in growing steps, pausing after each one, before the
        # page source is read.
        # NOTE(review): presumably this triggers lazy loading of the feed.
        for i in range(1, 6):
            height = 20000 * i
            driver.execute_script("window.scrollBy(0," + str(height) + ")")
            time.sleep(4)

        # The post list is read from inside the "app_canvas_frame" iframe.
        driver.switch_to.frame("app_canvas_frame")

        # Snapshot the frame HTML once so parsing and the next-link check
        # below operate on the same document.
        page_html = driver.page_source
        selector = etree.HTML(page_html)
        divs = selector.xpath('//*[@id="msgList"]/li/div[3]')

        with open('qq_word.txt', 'a') as f:
            for div in divs:
                qq_name = div.xpath('./div[2]/a/text()')
                qq_content = div.xpath('./div[2]/pre/text()')
                qq_time = div.xpath('./div[4]/div[1]/span/a/text()')
                # Each xpath() call returns a list; take the first hit or ''.
                qq_name = qq_name[0] if qq_name else ''
                qq_content = qq_content[0] if qq_content else ''
                qq_time = qq_time[0] if qq_time else ''
                print(qq_name, qq_time, qq_content)
                f.write(qq_content + "\n")

        # No link with the expected id means this was the last page.
        next_id = 'pager_next_' + str(next_num)
        if next_id not in page_html:
            break

        driver.find_element_by_id(next_id).click()
        next_num += 1

        # Step back out of the iframe so the next round scrolls the outer page.
        driver.switch_to.parent_frame()
finally:
    # Always shut down the browser and its chromedriver process, including
    # when scraping fails part-way through.
    driver.quit()

生成词云(读取上一步爬虫保存的 qq_word.txt):

#coding:utf-8

from wordcloud import WordCloud
import matplotlib.pyplot as plt
import jieba

def create_word_cloud(filename, output_path='py_book.png',
                      font_path='/System/Library/Fonts/PingFang.ttc'):
    """Build a word cloud from ``<filename>.txt``, save it and display it.

    Args:
        filename: Path of the UTF-8 text corpus *without* the ``.txt``
            suffix; the suffix is appended here.
        output_path: Where the rendered image is written. Defaults to the
            original hard-coded ``py_book.png``.
        font_path: Font file handed to ``WordCloud``. Defaults to the
            original hard-coded PingFang path.

    Raises:
        IOError: If ``<filename>.txt`` cannot be opened.
    """
    # Local import keeps the block self-contained; io.open accepts an
    # explicit encoding on both Python 2 and Python 3.
    import io

    # Read with an explicit encoding and close the handle deterministically
    # instead of relying on the platform default and garbage collection.
    with io.open("{}.txt".format(filename), encoding="utf-8") as source:
        text = source.read()

    # Segment the text with jieba in full mode (cut_all=True) and join the
    # tokens with spaces to form the string passed to WordCloud.generate.
    words = " ".join(jieba.cut(text, cut_all=True))

    wc = WordCloud(
        background_color="white",
        max_words=2000,
        font_path=font_path,
        height=1200,
        width=1600,
        max_font_size=100,
        random_state=30,
    )
    cloud = wc.generate(words)

    # Save before showing: plt.show() can block until the window is closed,
    # and the image should exist on disk even if the viewer is interrupted.
    wc.to_file(output_path)

    plt.imshow(cloud)
    plt.axis("off")
    plt.show()

# Script entry point: render the cloud from qq_word.txt (create_word_cloud
# appends the ".txt" suffix to the name passed here).
if __name__ == '__main__':
    create_word_cloud('qq_word')

转载于:https://www.cnblogs.com/131li/p/8933562.html

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值