Several Basic Web Scraping Examples

Fetch the Baidu search results page source for a given name (beginner)

# -*- coding: UTF-8 -*-
# @Time : 2021/5/31 17:13
# @Author : 李如旭
# @File :111.py
# @Software: PyCharm

import requests

name = input("Enter the name to search for: ")

url = "https://www.baidu.com/s?wd=name"

head = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36"
}
response = requests.get(url=url, headers=head)

#print(response)
html = response.text
print(html)
response.close()
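One improvement worth noting: building the URL by string concatenation leaves the query un-encoded, which breaks for Chinese names. Passing the query through requests' `params` argument lets the library URL-encode it. A minimal sketch of the same request, using Baidu's `wd` query parameter as seen above:

import requests

name = input("Enter the name to search for: ")
head = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36"
}
# requests builds and URL-encodes the query string from the params dict
response = requests.get("https://www.baidu.com/s", params={"wd": name}, headers=head)
print(response.text)
response.close()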

Get translation results from Baidu Translate (beginner)

# -*- coding: UTF-8 -*-
# @Time : 2021/5/31 19:19
# @Author : 李如旭
# @File :baidufanyi.py
# @Software: PyCharm


import requests

url = "https://fanyi.baidu.com/sug"

word = input("Enter the English word to translate: ")

dat = {
    "kw": word
}
resp = requests.post(url, data=dat)

print(resp.json())

resp.close()
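Printing the whole JSON object is noisy. In observed responses the `sug` endpoint returns a shape like {"errno": 0, "data": [{"k": ..., "v": ...}]} — this structure is an assumption based on inspection, not a documented API. A sketch that prints only the suggestion pairs:

import requests

word = input("Enter the English word to translate: ")
resp = requests.post("https://fanyi.baidu.com/sug", data={"kw": word})
payload = resp.json()
# Assumed response shape: {"errno": 0, "data": [{"k": "...", "v": "..."}]}
for entry in payload.get("data", []):
    print(entry["k"], "->", entry["v"])
resp.close()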

Scrape the first page of the Douban movie chart (beginner)

# -*- coding: UTF-8 -*-
# @Time : 2021/5/31 19:33
# @Author : 李如旭
# @File :paihang.py
# @Software: PyCharm

import requests

url = "https://movie.douban.com/j/chart/top_list"

param = {
    "type": "24",
    "interval_id": "100:90",
    "action": "",
    "start": "0",
    "limit": "20",
}

head = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36"
}

resp = requests.get(url=url, params=param, headers=head)

print(resp.json())

resp.close()
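The `start` and `limit` parameters page through the chart, so fetching more than the first page is just a loop that steps `start`. A sketch for the first three pages; the `title` and `score` fields are assumed from the JSON items this endpoint returns:

import requests

url = "https://movie.douban.com/j/chart/top_list"
head = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36"
}
for start in range(0, 60, 20):   # three pages of 20 movies each
    param = {"type": "24", "interval_id": "100:90", "action": "",
             "start": str(start), "limit": "20"}
    resp = requests.get(url=url, params=param, headers=head)
    for movie in resp.json():    # each item is assumed to carry "title" and "score"
        print(movie["title"], movie["score"])
    resp.close()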

Scrape the Douban Movie Top 250: title, year, rating, and number of ratings

# -*- coding: UTF-8 -*-
# @Time : 2021/5/31 21:10
# @Author : 李如旭
# @File :豆瓣排行榜.py
# @Software: PyCharm

import re
import requests
import csv

url ="https://movie.douban.com/top250"

head = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36"
}
rsp = requests.get(url=url,headers=head)
html = rsp.text

#print(html)

# Extract the movie titles

# # # Method 1:
# name = re.finditer(r'<li>.*?<div class="item">.*?<span class="title">(?P<name>.*?)</span>', html, re.DOTALL)
# for i in name:
#     print(i.group("name"))
#
# # # Method 2:
# obj = re.compile(r'<li>.*?<div class="item">.*?<span class="title">(?P<name>.*?)</span>', re.S)
# result = obj.finditer(html)
# for it in result:
#     print(it.group("name"))
#
# # Method 3:
#
# name = re.findall(r'<li>.*?<div class="item">.*?<span class="title">(.*?)</span>', html, re.S)
# print(name)



# Extract title, year, rating, and number of ratings
obj = re.compile(r'<li>.*?<div class="item">.*?<span class="title">(?P<name>.*?)'
                 r'</span>.*?<p class="">.*?<br>(?P<year>.*?)&nbsp.*?'
                 r'<span class="rating_num" property="v:average">(?P<score>.*?)</span>.*?'
                 r'<span>(?P<number>.*?)</span>', re.S)
result = obj.finditer(html)


f = open("data.csv", mode="w", newline="", encoding="utf-8")  # newline="" avoids blank rows on Windows
csvwriter = csv.writer(f)

for it in result:
    # print(it.group("name"))
    # print(it.group("year").strip())
    # print(it.group("score"))
    # print(it.group("number"))
    dic = it.groupdict()
    dic['year'] = dic['year'].strip()
    csvwriter.writerow(dic.values())

f.close()
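Writing `dic.values()` through a plain `csv.writer` produces a file with no header row and silently depends on the dict's field order. A sketch of the same export with `csv.DictWriter`, meant to run in place of the writer block above (`obj` and `html` are the compiled pattern and page source from earlier):

import csv

with open("data.csv", mode="w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["name", "year", "score", "number"])
    writer.writeheader()                  # column names as the first row
    for it in obj.finditer(html):         # re-run the pattern; finditer is one-shot
        row = it.groupdict()
        row["year"] = row["year"].strip()
        writer.writerow(row)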


Scrape Baidu Images:

# -*- coding: UTF-8 -*-
# @Time : 2021/6/2 9:59
# @Author : 李如旭
# @File :百度图片.py
# @Software: PyCharm
import requests
import re
import os
word = input("Enter a search keyword (a person, place, etc.): ")
url = 'https://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=' + word + '&pn=30'
head = {
        'Access-Control-Allow-Credentials': 'true',
        'Connection': 'keep-alive',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'
    }
resp = requests.get(url=url, headers=head)
resp.encoding = "utf-8"
html = resp.text
# print(html)

urls = re.findall('"thumbURL":"(.*?)"',html)
num = 0
file = input("Enter a folder name to store the images in: ")
os.makedirs(file, exist_ok=True)  # don't crash if the folder already exists
for i in urls:
    pic = requests.get(i, timeout=7)
    # build the path portably instead of hand-joining backslashes
    path = os.path.join(file, word + '_' + str(num) + '.jpg')
    with open(path, 'wb') as fp:   # `with` closes each file even on errors
        fp.write(pic.content)
    num = num + 1
resp.close()
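The flip URL above fetches only one batch of thumbnails; `pn=30` looks like a paging offset. Treating `pn` as a multiple-of-30 offset is an assumption read off that URL, not documented behavior. A sketch that collects thumbnail URLs from several pages (reusing `word` and `head` from above):

import re
import requests

base = "https://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word={}&pn={}"
all_urls = []
for pn in range(0, 90, 30):     # assumed: pn steps through pages in multiples of 30
    page = requests.get(base.format(word, pn), headers=head)
    page.encoding = "utf-8"
    all_urls.extend(re.findall('"thumbURL":"(.*?)"', page.text))
    page.close()
print(len(all_urls), "thumbnail URLs collected")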