An old rookie's second crawler, posting it here to mark the occasion!

from urllib import request
from bs4 import BeautifulSoup
import xlwt
import time


# Shared request headers: a desktop-browser User-Agent helps avoid naive bot blocking.
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 '
                         'Safari/537.36 Edge/18.17763'}

links = []   # listing-page URLs: the entered URL plus its pagination links
webs = []    # detail-page URLs collected from the listings
ydm = []     # company names
yddh = []    # phone numbers
yddz = []    # addresses


def website():
    # Ask the user for the first listing page and remember it.
    ws = input('Enter the URL: ')
    links.append(ws)
    return ws


def liulan():
    # Fetch the first listing page and collect its pagination links.
    url = website()
    rep = request.Request(url, headers=HEADERS)
    rsp = request.urlopen(rep)
    result = rsp.read()
    soup = BeautifulSoup(result, 'html.parser')
    tags = soup.find(attrs={'class': 'page'})
    for i in tags:
        if i.name == 'a':
            links.append(i['href'])
    # The first pagination anchor presumably points back at the page we
    # already fetched, so drop the duplicate.
    del links[1]
    

def getwebs():
    # Visit every listing page and pull each entry's detail-page link.
    for link in links:
        rep = request.Request(link, headers=HEADERS)
        rsp = request.urlopen(rep)
        result = rsp.read()
        soup = BeautifulSoup(result, 'html.parser')
        ul = soup.ul
        for item in ul.contents:
            if item.name == 'li':
                web = item.strong.a.attrs['href']
                webs.append(web)
    

def info():
    # Visit each detail page and extract the name, phone number and address.
    for web in webs:
        try:
            rep = request.Request(web, headers=HEADERS)
            rsp = request.urlopen(rep)
            result = rsp.read()
            soup = BeautifulSoup(result, 'html.parser')
            tags = soup.find(attrs={'class': 'company'})
            name = str(tags.h1.string)
            print(name)
            ydm.append(name)
            # The phone number is the second whitespace-separated token of the <p> text.
            phone = str(tags.p.text).split()[1]
            print(phone)
            yddh.append(phone)
            # The first few <li> entries carry the address; strip the
            # 5-character label prefix from each and join the two parts.
            cleartext = soup.ul.contents
            zhenghe1 = [str(x.text) for x in cleartext[0:5] if x.name == 'li']
            zhenghe2 = [s[5:] for s in zhenghe1]
            sss = zhenghe2[0] + zhenghe2[1]
            yddz.append(sss)
            print(sss)
            # time.sleep(0.9)  # optional polite delay between requests
        except Exception as e:
            # A missing field or an HTTP error just skips this entry.
            print(e)


def excel():
    # The sheet name is carved out of the entered URL; the slice [26:-5]
    # presumably drops the site-specific prefix and the '.html' suffix.
    sheetname = links[0][26:-5]
    ex = xlwt.Workbook()
    sheet = ex.add_sheet(sheetname)
    # Column 0: names, column 1: phone numbers, column 2: addresses.
    for row, name in enumerate(ydm):
        sheet.write(row, 0, name)
    for row, phone in enumerate(yddh):
        sheet.write(row, 1, phone)
    for row, addr in enumerate(yddz):
        sheet.write(row, 2, addr)
    ex.save(f'{sheetname}.xls')

liulan()
getwebs()
info()
excel()
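
The parsing above assumes a very specific page layout: a <ul> of <li> entries whose detail link sits inside <strong><a>. A quick way to sanity-check that logic offline is to feed BeautifulSoup a synthetic snippet shaped the same way; the HTML below is invented for illustration, not taken from the real site:

from bs4 import BeautifulSoup

# Invented listing markup mirroring the layout getwebs() expects.
sample = '''
<ul>
  <li><strong><a href="https://example.com/shop/1.html">Shop One</a></strong></li>
  <li><strong><a href="https://example.com/shop/2.html">Shop Two</a></strong></li>
</ul>
'''

soup = BeautifulSoup(sample, 'html.parser')
for item in soup.ul.contents:
    if item.name == 'li':
        print(item.strong.a.attrs['href'])
# prints:
# https://example.com/shop/1.html
# https://example.com/shop/2.html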

 
