Python programs I wrote

1. Douban book listings (the "商业" tag)

import requests
import pandas as pd
from bs4 import BeautifulSoup
import re


def main():
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'}
    for i in range(0, 20):
        # Each page of the tag listing shows 20 books; "start" is the offset.
        url = f'https://book.douban.com/tag/%E5%95%86%E4%B8%9A?start={i*20}&type=S'
        response = requests.get(url=url, headers=headers, timeout=30)
        html_content = response.text
        soup = BeautifulSoup(html_content, 'lxml')
        p_html(soup)
        print(f'Page {i + 1} scraped')
        print(url)


# Column name -> list of values; p_html() appends one entry per book.
data_info = {'图书名称':[], '图书作者':[], '图书出版时间':[], '评分':[], '评价人数':[], '简介':[]}


def p_html(soup):
    li_list = soup.select('.subject-list li')
    for li in li_list:
        # The title attribute of the link inside div.info holds the full book name.
        name = re.findall(r'<div class="info">.*?title="(.*?)".*?</a>', str(li), re.S)[0]
        data_info['图书名称'].append(name)
        # The .pub line reads "author / publisher / date / price"; split on "/".
        pub = li.select('.info .pub')[0].text
        data_info['图书作者'].append(pub.split('/')[0].strip())
        data_info['图书出版时间'].append(pub.split('/')[-2].strip())
        # Inside .star, span[1] is the numeric rating and span[2] the rating count.
        data_info['评分'].append(float(li.select('.info .star span')[1].text.strip()))
        data_info['评价人数'].append(int(li.select('.info .star span')[2].text.replace('(', '').replace('人评价)', '').strip()))
        # The first <p> holds the one-line description; guard against missing ones.
        p = li.find('p')
        data_info['简介'].append(p.get_text(strip=True) if p else '')
    return data_info


if __name__ == '__main__':  # entry point
    main()
    book_info = pd.DataFrame(data_info)
    print(book_info.isnull())
    print(book_info.duplicated())
    book_info = book_info.dropna()
    # pandas >= 2.0 removed the encoding argument of to_excel; the file is UTF-8 by default.
    book_info.to_excel('豆瓣图书排名.xlsx', index=False)
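
The loop above fires its requests back to back with no error handling. A minimal sketch of a politer fetch helper, assuming a fixed delay and retry count (both numbers are my assumptions, not from the original):

import time
import requests

def fetch(url, headers, retries=3, delay=1.0):
    # Hypothetical helper: retry a few times and pause between attempts
    # so the scraper does not hammer the site.
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()  # raise on 4xx/5xx instead of parsing an error page
            return response.text
        except requests.RequestException as e:
            print(f'attempt {attempt + 1} failed: {e}')
            time.sleep(delay)
    return ''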


2. Boss Zhipin

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import pandas as pd


datalist = {'工作名称':[],
            '工作地点':[],
            '薪资待遇':[],
            '经验学历':[],
            '技能标签':[],
            '公司名称':[],
            '公司类型':[],
            '福利待遇':[],
            '招聘链接':[]}


def get_job_info():
    lis = drive.find_elements(By.CSS_SELECTOR, '.job-list li')
    for li in lis:
        # Posting link
        link = li.find_element(By.CSS_SELECTOR, '.job-name a').get_attribute('href')
        datalist['招聘链接'].append(link)
        # Job title
        name = li.find_element(By.CSS_SELECTOR, '.job-name a').text
        datalist['工作名称'].append(name)
        # Location
        area = li.find_element(By.CSS_SELECTOR, '.job-area').text
        datalist['工作地点'].append(area)
        # Company name
        company_name = li.find_element(By.CSS_SELECTOR, '.company-text .name a').text
        datalist['公司名称'].append(company_name)
        # Company type
        company_type = li.find_element(By.CSS_SELECTOR, '.company-text p a').text
        datalist['公司类型'].append(company_type)
        # Salary
        money = li.find_element(By.CSS_SELECTOR, '.red').text
        datalist['薪资待遇'].append(money)
        # Experience and education
        exp = li.find_element(By.CSS_SELECTOR, '.job-limit p').text
        datalist['经验学历'].append(exp)
        # Skill tags: collect until the first empty tag
        tags = li.find_elements(By.CLASS_NAME, 'tag-item')
        add = []
        for tag in tags:
            if tag.text == '':
                break
            add.append(tag.text)
        datalist['技能标签'].append('/'.join(add))
        # Benefits
        boon = li.find_element(By.CSS_SELECTOR, '.info-desc').text
        datalist['福利待遇'].append(boon)
    return datalist


if __name__ == '__main__':
    drive = webdriver.Chrome()
    drive.implicitly_wait(10)
    # e_103 .. e_106 are the experience-level filters; scrape up to 6 pages of each.
    for i in range(103, 107):
        drive.get(f"https://www.zhipin.com/c100010000/e_{i}/?query=数据分析&page=1&ka=page-")
        for page in range(1, 7):
            print(f'Scraping e_{i}, page {page}')
            get_job_info()
            try:
                # A disabled "next" button means this was the last page.
                drive.find_element(By.CSS_SELECTOR, ".page .disabled")
                break
            except NoSuchElementException:
                drive.find_element(By.CSS_SELECTOR, ".next").click()
    job_info = pd.DataFrame(datalist)
    # pandas >= 2.0 removed the encoding argument of to_excel.
    job_info.to_excel('boss直聘数据分析岗位全国.xlsx', index=False)
    drive.quit()
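
implicitly_wait only sets a global polling timeout, so a slow page can still hand back an empty job list. A minimal sketch of an explicit wait for the same .job-list selector, using the standard Selenium 4 wait API:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_for_jobs(driver, timeout=10):
    # Block until at least one job card is present, or raise TimeoutException.
    return WebDriverWait(driver, timeout).until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.job-list li'))
    )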

3. Data processing

import pandas as pd


boss = pd.read_excel(r"C:\Users\hwt\Desktop\数据分析求职\boss直聘数据分析岗位全国.xlsx")
# Split "city·district·address" into three separate columns.
city = boss['工作地点'].str.split('·').str[0]
boss.insert(loc=2, column='城市', value=city)
district = boss['工作地点'].str.split('·').str[1]
boss.insert(loc=3, column='区县', value=district)
area = boss['工作地点'].str.split('·').str[2]
boss.insert(loc=4, column='地址', value=area)
# The lower bound of a salary range such as "15-25K" becomes the starting salary;
# expand=False yields a Series, and float tolerates rows the pattern does not match.
startsalary = boss['薪资待遇'].str.extract(r'(\d+)-', expand=False).astype('float') * 1000
boss.insert(loc=6, column='起薪', value=startsalary)
# For "3-5年本科"-style strings, the years go to 经验 and the last two characters to 学历.
exp = boss['经验学历'].str.extract(r'(\d-?\d*年)', expand=False)
boss.insert(loc=8, column='经验', value=exp)
degree = boss['经验学历'].str.extract(r'\d-?\d*[\u4e00-\u9fa5]+([\u4e00-\u9fa5]{2})', expand=False)
boss.insert(loc=9, column='学历', value=degree)

boss = boss.drop_duplicates(['工作名称', '薪资待遇', '公司名称'])
boss = boss.fillna('')  # fillna returns a copy; the result must be assigned back
boss.sort_values(by=['公司名称', '经验', '起薪'], ascending=[True, True, True], inplace=True)
# pandas >= 2.0 removed the encoding argument of to_excel.
boss.to_excel('boss直聘数据分析岗位全国cleaning.xlsx', index=False)
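
Before running the extraction patterns over the whole frame, it is worth checking them on a sample string first; the sample values below are made up:

import re

print(re.search(r'(\d+)-', '15-25K·13薪').group(1))    # 15 -> starting salary 15000
print(re.search(r'(\d-?\d*年)', '1-3年本科').group(1))   # 1-3年
print(re.search(r'\d-?\d*[\u4e00-\u9fa5]+([\u4e00-\u9fa5]{2})', '1-3年本科').group(1))  # 本科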

4. Regular expressions (Douban movie Top 250)

# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup  # HTML parsing
import re  # regular expressions for text matching
import urllib.request  # fetch page content for a URL
import urllib.error  # HTTP/URL error types
import openpyxl  # Excel output


def main():
    baseurl = 'https://movie.douban.com/top250?start='
    datalist = getData(baseurl)  # scrape and parse the source pages
    saveData(datalist)  # save the results


# Link to the film's detail page
findLink = re.compile(r'<a href="(.*?)">')  # compiled pattern describing the string shape
# Poster image
findImgSrc = re.compile(r'<img.*src="(.*?)"', re.S)  # re.S lets "." also match newlines
# Title
findTitle = re.compile(r'<span class="title">(.*)</span>')
# Rating
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')
# Number of ratings
findJudge = re.compile(r'<span>(\d*)人评价</span>')
# One-line summary
findInq = re.compile(r'<span class="inq">(.*)</span>')
# Additional details (director, cast, year, genre)
findBd = re.compile(r'<p class="">(.*?)</p>', re.S)


def getData(baseurl):  # scrape all 10 list pages
    datalist = []
    for i in range(0, 10):  # 25 films per page, offset via the "start" parameter
        url = baseurl + str(i*25)
        html = askURL(url)  # fetch the page source
        # Parse every film entry on the page
        soup = BeautifulSoup(html, 'html.parser')
        for item in soup.find_all('div', class_='item'):  # each div.item is one film
            data = []  # all fields of one film
            item = str(item)

            # Detail-page link
            link = re.findall(findLink, item)[0]  # apply the compiled pattern
            data.append(link)

            imgSrc = re.findall(findImgSrc, item)[0]
            data.append(imgSrc)

            titles = re.findall(findTitle, item)
            if len(titles) == 2:
                ctitle = titles[0]  # Chinese title
                data.append(ctitle)
                otitle = titles[1].replace('/', '')  # original-language title
                data.append(otitle)
            else:
                data.append(titles[0])
                data.append(' ')

            rating = re.findall(findRating, item)[0]
            data.append(rating)

            judge = re.findall(findJudge, item)[0]
            data.append(judge)

            inq = re.findall(findInq, item)
            if len(inq) != 0:
                data.append(inq[0].replace('。', ''))
            else:
                data.append(' ')

            bd = re.findall(findBd, item)[0]
            bd = re.sub(r'<br(\s+)?/>(\s+)?', ' ', bd)  # strip <br/> tags
            bd = bd.replace('/', ' ')
            data.append(bd.strip())  # drop leading/trailing whitespace

            datalist.append(data)  # one finished film record

    return datalist


def askURL(url):  # fetch the page content for one URL
    head = {  # browser-like headers so Douban serves the normal page
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'}
    request = urllib.request.Request(url, headers=head)
    html = ''
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode('utf-8')  # decode the page source as UTF-8
    except urllib.error.URLError as e:
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, 'reason'):
            print(e.reason)
    return html


def saveData(datalist):  # write the records to an Excel workbook
    workbook = openpyxl.Workbook()
    worksheet = workbook.active
    worksheet.title = '豆瓣电影Top250'
    col = ('电影详情链接', '图片链接', '影片中文名', '影片外文名', '评分', '评价人数', '概况', '相关信息')
    for i in range(len(col)):
        worksheet.cell(1, i+1, col[i])  # row 1: column headers
    for i, data in enumerate(datalist):  # data rows start at row 2
        for j in range(len(col)):
            worksheet.cell(i+2, j+1, data[j])
    workbook.save('豆瓣电影Top250数据.xlsx')


if __name__ == '__main__':  # entry point
    main()
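
The original import list mentioned SQLite but never used it. A minimal sketch of writing datalist to a local database instead of Excel; the table layout and file name are my own choices:

import sqlite3

def saveData2DB(datalist, dbpath='movie250.db'):
    # One row per film, columns in the same order as saveData().
    con = sqlite3.connect(dbpath)
    con.execute('''CREATE TABLE IF NOT EXISTS movie250
                   (link TEXT, img TEXT, cname TEXT, oname TEXT,
                    score TEXT, rated TEXT, inq TEXT, info TEXT)''')
    con.executemany('INSERT INTO movie250 VALUES (?, ?, ?, ?, ?, ?, ?, ?)', datalist)
    con.commit()
    con.close()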

5. Dangdang

import requests
import pandas as pd
from bs4 import BeautifulSoup


def main():
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'}
    for i in range(1, 26):  # the bestseller list spans 25 pages
        url = f'http://bang.dangdang.com/books/bestsellers/01.00.00.00.00.00-recent30-0-0-1-{i}'
        response = requests.get(url=url, headers=headers, timeout=30)
        html_content = response.text
        soup = BeautifulSoup(html_content, 'lxml')
        p_html(soup)
        print(f'Page {i} scraped')


data_info = {'图书排名':[], '图书名称':[], '图书作者':[], '图书出版时间':[], '图书出版社':[], '图书价格':[]}


def p_html(soup):
    li_list = soup.select('.bang_list li')
    for li in li_list:
        data_info['图书排名'].append(li.select('.list_num')[0].text.replace('.', ''))
        data_info['图书名称'].append(li.select('.name a')[0].text)
        # .publisher_info appears twice: first with the authors, then with the publisher.
        data_info['图书作者'].append(li.select('.publisher_info')[0].select('a')[0].text)
        data_info['图书出版时间'].append(li.select('.publisher_info span')[0].text)
        data_info['图书出版社'].append(li.select('.publisher_info')[1].select('a')[0].text)
        data_info['图书价格'].append(float(li.select('.price .price_n')[0].text.replace('¥', '')))
    return data_info

if __name__ == '__main__':  # entry point
    main()
    book_info = pd.DataFrame(data_info)
    print(book_info.isnull())
    print(book_info.duplicated())
    book_info = book_info.dropna()
    # pandas >= 2.0 removed the encoding argument of to_excel.
    book_info.to_excel('当当网图书销售排行.xlsx', index=False)
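
Once the Excel file exists, it can be sliced for a quick look. A small usage sketch; the groupby choice is mine, and the column names come from data_info above:

import pandas as pd

books = pd.read_excel('当当网图书销售排行.xlsx')
# Average price per publisher, highest first: a quick sanity check of the scrape.
print(books.groupby('图书出版社')['图书价格'].mean().sort_values(ascending=False).head(10))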

