【Simple Python Web Scrapers】

Scraping a novel from Qidian
import requests
from bs4 import BeautifulSoup
import bs4


path="3.txt" # 这里目录也可以改成绝对目录
fw=open(path,'w',encoding='utf-8')	# open函数要加上编码为'utf-8'不然会乱码

url = "https://book.qidian.com/info/1015323848#Catalog"
r = requests.get(url, timeout=30)
r.raise_for_status()  # raise an exception if the status code signals an error
r.encoding = r.apparent_encoding

soup = BeautifulSoup(r.text, 'html.parser')
for li in soup.find_all('ul')[3].children:  # walk the chapter list's children; dropping .children also turned out to work
    if isinstance(li, bs4.element.Tag):  # only process real bs4 Tag nodes, skipping stray strings
        chapter = li.a
        # print(chapter)
        chapter_url = 'https:%s' % chapter.get('href')
        # chapter_title = chapter.contents  # -> ['第一节 龙城']
        chapter_title = chapter.text        # -> 第一节 龙城
        # print(chapter_title)
        # print(chapter_url)
        chapter_response = requests.get(chapter_url)
        chapter_response.encoding = chapter_response.apparent_encoding
        chapter_html = chapter_response.text
        chapter_soup = BeautifulSoup(chapter_html, 'html.parser')
        chapter_tag = chapter_soup.find('div', class_='read-content j_readContent')
        fw.write(chapter_title)
        fw.write(chapter_tag.text)

fw.close()

The code here is rough and not cleaned up; the point was just to get it working. In the end the scraped data was successfully written to the file, as shown below:
[Figure: screenshot of 3.txt containing the scraped chapters]
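
One refinement worth adding (my own suggestion, not part of the original script): send a User-Agent and pause between chapter requests, so the site is less likely to block the crawl. A minimal sketch:

import time
import requests

headers = {'User-Agent': 'Mozilla/5.0'}  # identify the client instead of the default python-requests UA

def fetch(url):
    r = requests.get(url, headers=headers, timeout=30)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    return r.text

# inside the chapter loop above, replace the plain requests.get with:
#     chapter_html = fetch(chapter_url)
#     time.sleep(1)  # wait a second between chapters to stay polite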

Downloading a single image

https://wx4.sinaimg.cn/mw690/8d05b653ly1g4n0elm7axj20j60b475b.jpg

import requests
path = 'D://abc.jpg'  # save to the D: drive as abc.jpg
url = 'https://wx4.sinaimg.cn/mw690/8d05b653ly1g4n0elm7axj20j60b475b.jpg'
# or keep the original filename:
# root = 'D://'
# path = root + url.split('/')[-1]
r = requests.get(url)
with open(path, 'wb') as f:
    f.write(r.content)

First a file is opened, then the content returned from the URL is written to that path; r.content is the response body in binary (bytes) form.
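
A small variant of the same idea (my own sketch, assuming the same image URL): keep the original filename, as the commented-out lines above hint at, and stream the download in chunks so a large file never sits in memory all at once.

import os
import requests

url = 'https://wx4.sinaimg.cn/mw690/8d05b653ly1g4n0elm7axj20j60b475b.jpg'
root = 'D://'
path = os.path.join(root, url.split('/')[-1])  # name the file after the last URL segment

r = requests.get(url, stream=True)  # stream=True: fetch headers now, download the body lazily
r.raise_for_status()
with open(path, 'wb') as f:
    for chunk in r.iter_content(chunk_size=8192):  # write the body in 8 KB chunks
        f.write(chunk)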

Scraping Tmall iPad product listings
import requests
import re
import json

def getHtml(url):
    try:
        response = requests.get(url)
        if response.status_code != 200:
            return None
        return response.text
    except requests.RequestException:  # treat network errors the same as a bad status
        return None

def getInf(html):
    # four capture groups per product block: link, price, title, shop name
    pattern = re.compile('<div class="product.*?<a href="(.*?)".*?</b>(.*?)</em>.*?title="(.*?)".*?<a.*?>(.*?)</a>.*?</div>', re.S)
    items = re.findall(pattern, html)
    for item in items:
        yield {
            'href':item[0],
            'price':item[1],
            'title':item[2],
            'shop':item[3]
        }

def writeFile(content):
    with open('3.txt', 'a', encoding='utf-8') as f:  # the with-block closes the file automatically
        f.write(json.dumps(content, ensure_ascii=False) + '\n')

def main():
    url = 'https://list.tmall.com/search_product.htm?q=ipad&type=p&vmarket=&spm=875.7931836%2FB.a2227oh.d100&from=mallfp..pc_1_searchbutton'
    html = getHtml(url)
    if html is None:  # the request failed or was rejected
        return
    for item in getInf(html):
        print(item)
        writeFile(item)

if __name__ == '__main__':
    main()
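
Tmall often requires cookies or extra headers, so the request above can come back empty. The snippet below is a hand-written stand-in (my own illustration, shaped like the markup the pattern expects) that lets you sanity-check the regex and the generator without touching the network:

sample = ('<div class="product"><a href="//detail.tmall.com/item.htm?id=1">'
          '<b>¥</b>3288.00</em><img title="Apple iPad 10.2" />'
          '<a target="_blank">apple官方旗舰店</a></div>')

for item in getInf(sample):
    print(item)
# -> {'href': '//detail.tmall.com/item.htm?id=1', 'price': '3288.00',
#     'title': 'Apple iPad 10.2', 'shop': 'apple官方旗舰店'}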
Scraping the 2020 USTC graduate admission list

The code below shows only the key parts: the HTML of the admission announcement has to be fetched from the official site beforehand and saved to a text file (4.txt here).

import requests
import re
import json
import pandas as pd
import sys
import os
# print('Current Python interpreter path:')
# print(sys.executable)

def getInf(html):
    pattern = re.compile('<tr style=\'height:14.15pt.*?<td width=.*?<p class=MsoNormal.*?<span.*?>(.*?)</span>.*?\
    <td width=.*?<p class=MsoNormal.*?<span.*?>(.*?)</span>.*?\
    <td width=.*?<p class=MsoNormal.*?<span.*?>(.*?)</span>.*?\
    <td width=.*?<p class=MsoNormal.*?<span.*?>(.*?)</span>.*?\
    <td width=.*?<p class=MsoNormal.*?<span.*?>(.*?)</span>.*?\
    <td width=.*?<p class=MsoNormal.*?<span.*?>(.*?)</span>.*?\
    <td width=.*?<p class=MsoNormal.*?<span.*?>(.*?)</span>.*?\
    <td width=.*?<p class=MsoNormal.*?<span.*?>(.*?)</span>.*?\
    <td width=.*?<p class=MsoNormal.*?<span.*?>(.*?)</span>.*?\
    <td width=.*?<p class=MsoNormal.*?<span.*?>(.*?)</span>.*?</tr>', re.S)
                                    
    items = re.findall(pattern, html)
    result = list(items)
    df = pd.DataFrame(data=result[1:], columns=result[0])  # the first matched row holds the column headers
    print(set(df['拟录取单位']))  # the distinct admitting units
    ke = ['计算机科学与技术学院', '软件学院']
    df = df.drop(index=df[~df['拟录取单位'].isin(ke)].index)  # keep only these two schools
    print(df)
    df = df.sort_values(by=['拟录取单位', '拟录取专业', '总分'], ascending=[True, False, False])
    print(len(df))
    fil = 'D:/2020Ustc拟录取.xlsx'
    if os.path.exists(fil):  # remove any stale copy before writing
        os.remove(fil)
    df.to_excel(fil)

def writeFile(content):
    with open('3.txt', 'a', encoding='utf-8') as f:  # the with-block closes the file automatically
        f.write(json.dumps(content, ensure_ascii=False) + '\n')

def main():
    # the announcement HTML must be fetched and saved to 4.txt beforehand (see the note above)
    s = open("4.txt", encoding='utf-8').read()
    getInf(s)

if __name__ == '__main__':
    main()
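
As an aside (my own note, not from the original post): the drop/isin combination in getInf reads more directly as a boolean mask. A toy demonstration with made-up rows:

import pandas as pd

df = pd.DataFrame({'拟录取单位': ['计算机科学与技术学院', '数学科学学院', '软件学院'],
                   '总分': ['388', '401', '375']})
ke = ['计算机科学与技术学院', '软件学院']
df = df[df['拟录取单位'].isin(ke)]  # keep rows whose unit is in ke; same effect as the drop() call
print(df)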
Scraping the 2021 USTC graduate admission list

Compared with the previous year the format changed slightly, so the regular expression needed a few adjustments.

import requests
import re
import json
import pandas as pd
import sys
import os
# print('Current Python interpreter path:')
# print(sys.executable)

def getInf(html):
    pattern = re.compile('<tr style=\'mso-yfti-irow.*?\
    <td width="9%".*?<p class=MsoNormal.*?<span.*?>(.*?)</span>.*?\
    <td width="3%".*?<p class=MsoNormal.*?<span.*?>(.*?)</span>.*?\
    <td width="5%".*?<p class=MsoNormal.*?<span.*?>(.*?)</span>.*?\
    <td width="21%".*?<p class=MsoNormal.*?<span.*?>(.*?)</span>.*?\
    <td width="18%".*?<p class=MsoNormal.*?<span.*?>(.*?)</span>.*?\
    <td width="5%".*?<p class=MsoNormal.*?<span.*?>(.*?)</span>.*?\
    <td width="6%".*?<p class=MsoNormal.*?<span.*?>(.*?)</span>.*?\
    <td width="4%".*?<p class=MsoNormal.*?<span.*?>(.*?)</span>.*?\
    <td width="3%".*?<p class=MsoNormal.*?<span.*?>(.*?)</span>.*?\
    <td width="4%".*?<p class=MsoNormal.*?<span.*?>(.*?)</span>.*?\
    <td width="4%".*?<p class=MsoNormal.*?<span.*?>(.*?)</span>.*?</tr>', re.S)
                                    
    items = re.findall(pattern, html)
    print(items)
    result = list(items)
    df = pd.DataFrame(data=result[1:], columns=result[0])  # the first matched row holds the column headers
    print(set(df['拟录取单位']))
    print(df)
    df = df.sort_values(by=['拟录取单位', '拟录取专业', '总分'], ascending=[True, False, False])
    print(len(df))
    fil = 'D:/2021Ustc拟录取.xlsx'
    if os.path.exists(fil):  # remove any stale copy before writing
        os.remove(fil)
    df.to_excel(fil)

def writeFile(content):
    with open('3.txt', 'a', encoding='utf-8') as f:  # the with-block closes the file automatically
        f.write(json.dumps(content, ensure_ascii=False) + '\n')

def main():
    s = open("4.txt", encoding='utf-8').read()  # read back the HTML saved below
    getInf(s)

if __name__ == '__main__':
    url = "http://search.gradschool.ustc.edu.cn/download/yzbgs/2021tkssgs.htm"
    kv = {'User-Agent': 'Mozilla/5.0'}  # a minimal User-Agent so the server accepts the request
    response = requests.get(url, headers=kv)
    response.encoding = 'gb2312'  # the page is served as gb2312
    html = response.text
    with open('4.txt', 'w', encoding='utf-8') as f:  # save the page for main()/getInf() to consume
        f.write(html)
    main()
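
Worth mentioning as an alternative (my own sketch, not what was used above): for Word-exported HTML tables like these, pandas.read_html can often recover the table directly, which is far less brittle than a hand-written regex. This assumes lxml (or html5lib plus bs4) is installed and that the admission list is the longest table on the page:

from io import StringIO
import pandas as pd

with open('4.txt', encoding='utf-8') as f:
    html = f.read()

tables = pd.read_html(StringIO(html), header=0)  # one DataFrame per <table> in the document
df = max(tables, key=len)  # assumption: the admission list is the longest table on the page
print(df.head())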