As everyone knows, Baidu Wenku doesn't let non-VIP users copy its content directly. As a penniless user I deeply resent this, so I decided to scrape the content with Python.
# coding=utf-8
import re                # regular expressions, used to pull the text out of the response
import tkinter as tk     # simple GUI
import urllib.request    # the urllib submodules must be imported explicitly
import urllib.error

window = tk.Tk()
window.title('百度文库爬虫')        # "Baidu Wenku scraper"
window.geometry('500x300')

baseNum = tk.Label(window, text='请输入网址:')   # "Please enter the URL:"
baseNum.pack()
base_text = tk.StringVar()
base = tk.Entry(window, textvariable=base_text)
base.pack()

def xxxx():
    # Read the URL from the entry box, scrape it, and append the result
    # to a text file on the desktop.
    url = base_text.get()
    content_list = kaishi(url)
    with open(r'C:\Users\Administrator\Desktop\123.txt', 'a+', encoding='utf-8') as f:
        for item in content_list:
            f.write(str(item))
    print(content_list)

def main():
    tk.Button(window, text="生成桌面文件", command=xxxx).pack()   # "generate desktop file"
    tk.Button(window, text="退出", command=window.quit).pack()     # "quit"
    window.mainloop()

def kaishi(url):
    # Fetch the captured Request URL and extract the text fragments.
    headers = {
        # Copy the request headers from your own packet capture here;
        # a User-Agent is usually the minimum Baidu will accept.
        'User-Agent': 'Mozilla/5.0',
    }
    request = urllib.request.Request(url, headers=headers)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        # Chinese characters come back as \uXXXX escapes, so decode
        # with unicode_escape to get readable text.
        html = response.read().decode('unicode_escape')
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    # Each fragment of text sits in a "c" field: {"c":"...","p":{...}}
    content_list = re.findall('"c":"(.*?)","p"', html)
    return content_list

if __name__ == "__main__":
    main()
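If you'd rather use requests than urllib, the same fetch-and-extract step can be written as below. This is only a minimal sketch: fetch_content is an illustrative name, the User-Agent is a placeholder, and you should fill in whatever headers your packet capture actually shows.

import re
import requests

def fetch_content(url):
    # Placeholder headers; copy the real ones from your packet capture.
    headers = {'User-Agent': 'Mozilla/5.0'}
    resp = requests.get(url, headers=headers, timeout=10)
    resp.raise_for_status()  # fail loudly on HTTP errors instead of printing codes
    # Chinese text comes back as \uXXXX escapes, same as the urllib version.
    text = resp.content.decode('unicode_escape')
    # Grab every "c" field from the JSON-like response: {"c":"...","p":{...}}
    return re.findall('"c":"(.*?)","p"', text)

One advantage of requests is that requests.Session() can carry cookies across several calls, which may matter if Baidu ties the content URLs to your browsing session.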
If all the libraries are installed, running the script brings up a window like this:
Note that the address you enter must be a URL obtained through packet capture, namely the Request URL field of a captured request like the one shown here:
A 123.txt file will then be created on the desktop, containing the scraped content. Of course, if the document has many pages you also have to copy a separate URL for each page; my skills aren't up to solving that properly yet, so I'll come back and improve it later (a crude stopgap is sketched below).
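A minimal sketch of that stopgap, assuming you capture the Request URL of every page by hand: paste them into a list and reuse kaishi() from the script above.

# Stopgap for multi-page documents: one captured Request URL per page.
urls = [
    # 'https://.../page-1-request-url',
    # 'https://.../page-2-request-url',
]

with open(r'C:\Users\Administrator\Desktop\123.txt', 'a+', encoding='utf-8') as f:
    for page_url in urls:
        for piece in kaishi(page_url):   # kaishi() from the script above
            f.write(str(piece))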