An old rookie's second crawler, posting it here to mark the occasion!

from urllib import request
from bs4 import BeautifulSoup
import xlwt
import time


# Shared request headers: a desktop-browser User-Agent helps avoid naive bot blocking.
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 '
                         'Safari/537.36 Edge/18.17763'}

links = []   # listing-page URLs: the entered URL plus its pagination links
webs = []    # detail-page URLs collected from the listings
ydm = []     # company names
yddh = []    # phone numbers
yddz = []    # addresses


def website():
    # Ask the user for the first listing page and remember it.
    ws = input('Enter the URL: ')
    links.append(ws)
    return ws


def liulan():
    # Fetch the first listing page and collect its pagination links.
    url = website()
    rep = request.Request(url, headers=HEADERS)
    rsp = request.urlopen(rep)
    result = rsp.read()
    soup = BeautifulSoup(result, 'html.parser')
    tags = soup.find(attrs={'class': 'page'})
    for i in tags:
        if i.name == 'a':
            links.append(i['href'])
    # The first pagination anchor presumably points back at the page we
    # already fetched, so drop the duplicate.
    del links[1]
    

def getwebs():
    # Visit every listing page and pull each entry's detail-page link.
    for link in links:
        rep = request.Request(link, headers=HEADERS)
        rsp = request.urlopen(rep)
        result = rsp.read()
        soup = BeautifulSoup(result, 'html.parser')
        ul = soup.ul
        for item in ul.contents:
            if item.name == 'li':
                web = item.strong.a.attrs['href']
                webs.append(web)
    

def info():
    # Visit each detail page and extract the name, phone number and address.
    for web in webs:
        try:
            rep = request.Request(web, headers=HEADERS)
            rsp = request.urlopen(rep)
            result = rsp.read()
            soup = BeautifulSoup(result, 'html.parser')
            tags = soup.find(attrs={'class': 'company'})
            name = str(tags.h1.string)
            print(name)
            ydm.append(name)
            # The phone number is the second whitespace-separated token of the <p> text.
            phone = str(tags.p.text).split()[1]
            print(phone)
            yddh.append(phone)
            # The first few <li> entries carry the address; strip the
            # 5-character label prefix from each and join the two parts.
            cleartext = soup.ul.contents
            zhenghe1 = [str(x.text) for x in cleartext[0:5] if x.name == 'li']
            zhenghe2 = [s[5:] for s in zhenghe1]
            sss = zhenghe2[0] + zhenghe2[1]
            yddz.append(sss)
            print(sss)
            # time.sleep(0.9)  # optional polite delay between requests
        except Exception as e:
            # A missing field or an HTTP error just skips this entry.
            print(e)


def excel():
    # The sheet name is carved out of the entered URL; the slice [26:-5]
    # presumably drops the site-specific prefix and the '.html' suffix.
    sheetname = links[0][26:-5]
    ex = xlwt.Workbook()
    sheet = ex.add_sheet(sheetname)
    # Column 0: names, column 1: phone numbers, column 2: addresses.
    for row, name in enumerate(ydm):
        sheet.write(row, 0, name)
    for row, phone in enumerate(yddh):
        sheet.write(row, 1, phone)
    for row, addr in enumerate(yddz):
        sheet.write(row, 2, addr)
    ex.save(f'{sheetname}.xls')

liulan()
getwebs()
info()
excel()
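
The parsing above assumes a very specific page layout: a <ul> of <li> entries whose detail link sits inside <strong><a>. A quick way to sanity-check that logic offline is to feed BeautifulSoup a synthetic snippet shaped the same way; the HTML below is invented for illustration, not taken from the real site:

from bs4 import BeautifulSoup

# Invented listing markup mirroring the layout getwebs() expects.
sample = '''
<ul>
  <li><strong><a href="https://example.com/shop/1.html">Shop One</a></strong></li>
  <li><strong><a href="https://example.com/shop/2.html">Shop Two</a></strong></li>
</ul>
'''

soup = BeautifulSoup(sample, 'html.parser')
for item in soup.ul.contents:
    if item.name == 'li':
        print(item.strong.a.attrs['href'])
# prints:
# https://example.com/shop/1.html
# https://example.com/shop/2.html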

 
