BS4 + openpyxl: scraping 114 yellow-pages listings in a loop into Excel, with a tkinter input window (open-source code)


This post uses the BeautifulSoup library to scrape listing data from the 114 yellow-pages site.

The code is open-sourced on Gitee: https://gitee.com/aismarter/ScrapySpider_bs4_openpyxl_tinker

GitHub mirror: https://github.com/Aismarter/ScrapySpider_BS4_openpyxl_tinker

Analyzing the site

First, open the target page and work out which elements to scrape.

Click the part of the page you want, right-click, and choose "Inspect element".

Inspection shows that the target content sits in a td cell:

<td id="tdDetails" class="text" height="500" valign="top">释放数据潜能,激...。</td>

With the element located, the next step is to scrape it.
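As a quick sanity check before writing the full scraper, here is a minimal sketch of that extraction; the URL is a placeholder for whichever listing page is being inspected:

import requests
from bs4 import BeautifulSoup

url = "http://example.com/listing.html"  # placeholder; substitute the page under inspection
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')

# collect the text of every <td class="text"> cell, like the tdDetails cell above
for td in soup.find_all('td', class_="text"):
    print(td.get_text())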

Implementing the scraping loop

The first idea was a loop that prompts for a different URL on every pass. Typing URLs by hand caused too many mistakes, though, so the input moved to a tkinter GUI.

Each pass now opens a small window and reads the URL from it.

The GUI test code:

# tktest.py

import tkinter as tk


class Window:
    def __init__(self, handle):
        self.win = handle
        self.createwindow()
        self.run()  # blocks in mainloop until the window is closed

    def createwindow(self):
        self.win.geometry('400x400')
        # label showing the current text
        self.label_text = tk.StringVar()
        self.label_text.set("----")
        self.label = tk.Label(self.win,
                              textvariable=self.label_text,
                              font=('Arial', 11), width=15, height=2)
        self.label.pack()

        # entry for user input
        self.entry_text = tk.StringVar()
        self.entry = tk.Entry(self.win, textvariable=self.entry_text, width=30)
        self.entry.pack()

        # button that copies the entry text into the label
        self.button = tk.Button(self.win, text="set label to text", width=15, height=2, command=self.setlabel)
        self.button.pack()

    def setlabel(self):
        print(self.entry_text.get())
        self.label_text.set(self.entry_text.get())

    def get_input(self):
        # called after the window is closed; a StringVar keeps its value
        # even after the widgets are destroyed
        newinfo = self.entry_text.get()
        print("this is new information:  " + newinfo)
        return newinfo

    def run(self):
        try:
            self.win.mainloop()
        except Exception as e:
            print("*** exception:\n{}".format(e))


def input_info():
    window = tk.Tk()
    window.title('hello tkinter')
    Window(window)  # __init__ already calls run(); a second call would re-enter mainloop


if __name__ == "__main__":
    input_info()
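Note the control flow: constructing a Window enters mainloop immediately and blocks; once the user closes the window, the constructor returns and get_input() reads back whatever was last typed into the entry. The main script below relies on exactly this:

url = Window(tk.Tk()).get_input()  # open a window, block until it is closed, then return the entry text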

 

Final scrape results

Final code

import requests
import tkinter as tk
from tktest import Window
from bs4 import BeautifulSoup
from openpyxl import Workbook


def get_information(url):
    # scrape the field values: every <td class="lian-right"> cell
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    cells = soup.find_all('td', class_="lian-right")
    name_mess = []
    for cell in cells:
        mess = str(cell.get_text())
        print(mess)
        name_mess.append(mess)
    print(name_mess)
    return name_mess


def get_information_name(url):
    # scrape the field labels: every <td class="top2"> cell
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    cells = soup.find_all('td', class_="top2")
    name_mess = []
    for cell in cells:
        mess = str(cell.get_text())
        print(mess)
        name_mess.append(mess)
    print(name_mess)
    return name_mess


def get_information_company(url):
    # scrape the company description: every <td class="text"> cell
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    cells = soup.find_all('td', class_="text")
    name_mess = []
    for cell in cells:
        mess = str(cell.get_text())
        print(mess)
        name_mess.append(mess)
    print(name_mess)
    return name_mess


def store_into_excel(filename, titleList, company, name, wb, ws1, n):
    # the n-th scrape occupies two rows: labels on row 2n-1, values on row 2n
    r = 2 * n - 1
    # header row: field labels in columns 1, 2, 3, ...
    c = 1
    for title in titleList:
        ws1.cell(row=r, column=c, value=title)
        c += 1
    # company description follows the labels on the same row
    for item in company:
        ws1.cell(row=r, column=c, value=item)
        c += 1
    # value row: field values starting in column 3, under the labels
    r += 1
    c = 3
    for value in name:
        ws1.cell(row=r, column=c, value=value)
        c += 1

    wb.save(filename=filename)


def main():
    # collect every scrape into one Excel workbook
    wb = Workbook()
    filename = 'yellow_pages_data.xlsx'
    ws1 = wb.active
    n = 1
    while True:
        print("\n\n\n********** scrape #" + str(n))
        print("URL:")
        url = Window(tk.Tk()).get_input()
        print("got the target URL: " + url)
        titleList = get_information_name(url)
        company = get_information_company(url)
        print("scraping data...")
        name = get_information(url)
        store_into_excel(filename, titleList, company, name, wb, ws1, n)
        print("Enter any value to continue, or q to quit.")
        ans = Window(tk.Tk()).get_input()
        n += 1
        if ans == 'q':
            break


if __name__ == "__main__":
    main()
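For reference, a minimal sketch of the two-rows-per-scrape layout that store_into_excel produces; the labels and values here are made-up placeholders, not real scraped data:

from openpyxl import Workbook

wb = Workbook()
ws = wb.active

# hypothetical labels and values standing in for one scraped listing
labels = ['Company', 'Contact', 'Phone']
values = ['Example Co.', 'Zhang San', '0756-1234567']

n = 1              # first scrape -> rows 1 and 2; second -> rows 3 and 4; ...
r = 2 * n - 1
for c, label in enumerate(labels, start=1):
    ws.cell(row=r, column=c, value=label)      # label row
for c, value in enumerate(values, start=3):
    ws.cell(row=r + 1, column=c, value=value)  # value row, starting in column 3
wb.save('layout_demo.xlsx')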





 
