Several Basic Web Scraping Examples

Fetch the Baidu search results page source for a given name (beginner)

# -*- coding: UTF-8 -*-
# @Time : 2021/5/31 17:13
# @Author : 李如旭
# @File :111.py
# @Software: PyCharm

import requests

name = input("Enter the name to search for: ")

url = "https://www.baidu.com/s?wd=name"

head = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36"
}
response = requests.get(url=url, headers=head)

#print(response)
html = response.text
print(html)
response.close()
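One improvement worth noting: building the URL by string concatenation leaves the query un-encoded, which breaks for Chinese names. Passing the query through requests' `params` argument lets the library URL-encode it. A minimal sketch of the same request, using Baidu's `wd` query parameter as seen above:

import requests

name = input("Enter the name to search for: ")
head = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36"
}
# requests builds and URL-encodes the query string from the params dict
response = requests.get("https://www.baidu.com/s", params={"wd": name}, headers=head)
print(response.text)
response.close()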

Get translation results from Baidu Translate (beginner)

# -*- coding: UTF-8 -*-
# @Time : 2021/5/31 19:19
# @Author : 李如旭
# @File :baidufanyi.py
# @Software: PyCharm


import requests

url = "https://fanyi.baidu.com/sug"

word = input("Enter the English word to translate: ")

dat = {
    "kw": word
}
resp = requests.post(url, data=dat)

print(resp.json())

resp.close()
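Printing the whole JSON object is noisy. In observed responses the `sug` endpoint returns a shape like {"errno": 0, "data": [{"k": ..., "v": ...}]} — this structure is an assumption based on inspection, not a documented API. A sketch that prints only the suggestion pairs:

import requests

word = input("Enter the English word to translate: ")
resp = requests.post("https://fanyi.baidu.com/sug", data={"kw": word})
payload = resp.json()
# Assumed response shape: {"errno": 0, "data": [{"k": "...", "v": "..."}]}
for entry in payload.get("data", []):
    print(entry["k"], "->", entry["v"])
resp.close()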

Scrape the first page of the Douban movie chart (beginner)

# -*- coding: UTF-8 -*-
# @Time : 2021/5/31 19:33
# @Author : 李如旭
# @File :paihang.py
# @Software: PyCharm

import requests

url = "https://movie.douban.com/j/chart/top_list"

param = {
    "type": "24",
    "interval_id": "100:90",
    "action": "",
    "start": "0",
    "limit": "20",
}

head = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36"
}

resp = requests.get(url=url, params=param, headers=head)

print(resp.json())

resp.close()
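The `start` and `limit` parameters page through the chart, so fetching more than the first page is just a loop that steps `start`. A sketch for the first three pages; the `title` and `score` fields are assumed from the JSON items this endpoint returns:

import requests

url = "https://movie.douban.com/j/chart/top_list"
head = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36"
}
for start in range(0, 60, 20):   # three pages of 20 movies each
    param = {"type": "24", "interval_id": "100:90", "action": "",
             "start": str(start), "limit": "20"}
    resp = requests.get(url=url, params=param, headers=head)
    for movie in resp.json():    # each item is assumed to carry "title" and "score"
        print(movie["title"], movie["score"])
    resp.close()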

Scrape the Douban Movie Top 250: title, year, rating, and number of ratings

# -*- coding: UTF-8 -*-
# @Time : 2021/5/31 21:10
# @Author : 李如旭
# @File :豆瓣排行榜.py
# @Software: PyCharm

import re
import requests
import csv

url ="https://movie.douban.com/top250"

head = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36"
}
rsp = requests.get(url=url,headers=head)
html = rsp.text

#print(html)

# Extract the movie titles

# # # Method 1:
# name = re.finditer(r'<li>.*?<div class="item">.*?<span class="title">(?P<name>.*?)</span>', html, re.DOTALL)
# for i in name:
#     print(i.group("name"))
#
# # # Method 2:
# obj = re.compile(r'<li>.*?<div class="item">.*?<span class="title">(?P<name>.*?)</span>', re.S)
# result = obj.finditer(html)
# for it in result:
#     print(it.group("name"))
#
# # Method 3:
#
# name = re.findall(r'<li>.*?<div class="item">.*?<span class="title">(.*?)</span>', html, re.S)
# print(name)



# Extract title, year, rating, and number of ratings
obj = re.compile(r'<li>.*?<div class="item">.*?<span class="title">(?P<name>.*?)'
                 r'</span>.*?<p class="">.*?<br>(?P<year>.*?)&nbsp.*?'
                 r'<span class="rating_num" property="v:average">(?P<score>.*?)</span>.*?'
                 r'<span>(?P<number>.*?)</span>', re.S)
result = obj.finditer(html)


f = open("data.csv", mode="w", newline="", encoding="utf-8")  # newline="" avoids blank rows on Windows
csvwriter = csv.writer(f)

for it in result:
    # print(it.group("name"))
    # print(it.group("year").strip())
    # print(it.group("score"))
    # print(it.group("number"))
    dic = it.groupdict()
    dic['year'] = dic['year'].strip()
    csvwriter.writerow(dic.values())

f.close()
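Writing `dic.values()` through a plain `csv.writer` produces a file with no header row and silently depends on the dict's field order. A sketch of the same export with `csv.DictWriter`, meant to run in place of the writer block above (`obj` and `html` are the compiled pattern and page source from earlier):

import csv

with open("data.csv", mode="w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["name", "year", "score", "number"])
    writer.writeheader()                  # column names as the first row
    for it in obj.finditer(html):         # re-run the pattern; finditer is one-shot
        row = it.groupdict()
        row["year"] = row["year"].strip()
        writer.writerow(row)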


Scrape Baidu Images:

# -*- coding: UTF-8 -*-
# @Time : 2021/6/2 9:59
# @Author : 李如旭
# @File :百度图片.py
# @Software: PyCharm
import requests
import re
import os
word = input("Enter a search keyword (a person, place, etc.): ")
url = 'https://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=' + word + '&pn=30'
head = {
        'Access-Control-Allow-Credentials': 'true',
        'Connection': 'keep-alive',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'
    }
resp = requests.get(url=url, headers=head)
resp.encoding = "utf-8"
html = resp.text
# print(html)

urls = re.findall('"thumbURL":"(.*?)"',html)
num = 0
file = input("Enter a folder name to store the images in: ")
os.makedirs(file, exist_ok=True)  # don't crash if the folder already exists
for i in urls:
    pic = requests.get(i, timeout=7)
    # build the path portably instead of hand-joining backslashes
    path = os.path.join(file, word + '_' + str(num) + '.jpg')
    with open(path, 'wb') as fp:   # `with` closes each file even on errors
        fp.write(pic.content)
    num = num + 1
resp.close()
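The flip URL above fetches only one batch of thumbnails; `pn=30` looks like a paging offset. Treating `pn` as a multiple-of-30 offset is an assumption read off that URL, not documented behavior. A sketch that collects thumbnail URLs from several pages (reusing `word` and `head` from above):

import re
import requests

base = "https://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word={}&pn={}"
all_urls = []
for pn in range(0, 90, 30):     # assumed: pn steps through pages in multiples of 30
    page = requests.get(base.format(word, pn), headers=head)
    page.encoding = "utf-8"
    all_urls.extend(re.findall('"thumbURL":"(.*?)"', page.text))
    page.close()
print(len(all_urls), "thumbnail URLs collected")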