Python 火车票图片爬取代码(根据搜索词下载百度/谷歌/必应图片)

1、根据搜索词下载百度图片:

# -*- coding: utf-8 -*-

"""根据搜索词下载百度图片"""

import re
import sys
import urllib
import urllib.parse  # explicit: urllib.parse.quote is used below; `import urllib` alone does not guarantee the submodule is loaded

import requests

def get_onepage_urls(onepageurl):
    """Scrape one Baidu-image result page.

    Parameters
    ----------
    onepageurl : str
        URL of the result page to fetch. A falsy value means pagination
        is exhausted and the function returns immediately.

    Returns
    -------
    tuple[list[str], str]
        (picture URLs found on the page, absolute URL of the next page
        or '' when there is no next page / the request failed).
    """
    if not onepageurl:
        print('已到最后一页, 结束')
        return [], ''
    try:
        resp = requests.get(onepageurl)
        resp.encoding = 'utf-8'
        html = resp.text
    except Exception as e:
        # Best-effort: report the network error and stop pagination here.
        print(e)
        return [], ''
    pic_urls = re.findall('"objURL":"(.*?)",', html, re.S)
    # Capture the href of the "下一页" (next page) anchor. The scraped
    # version of this code had collapsed the pattern to the literal text
    # 下一页, which made fanye_url a nonsense URL like
    # 'http://image.baidu.com下一页'.
    fanye_urls = re.findall(r'<a href="(.*?)" class="n">下一页</a>', html)
    fanye_url = 'http://image.baidu.com' + fanye_urls[0] if fanye_urls else ''
    return pic_urls, fanye_url

def down_pic(pic_urls):
    """Download every image in ``pic_urls`` into the working directory.

    Files are named 1.jpg, 2.jpg, ... in list order. A failed download is
    reported and skipped so the remaining images are still fetched.
    """
    for seq, url in enumerate(pic_urls, start=1):
        try:
            response = requests.get(url, timeout=15)
            filename = str(seq) + '.jpg'
            with open(filename, 'wb') as out:
                out.write(response.content)
            print('成功下载第%s张图片: %s' % (str(seq), str(url)))
        except Exception as err:
            print('下载第%s张图片时失败: %s' % (str(seq), str(url)))
            print(err)
            continue

if __name__ == '__main__':
    keyword = '火车票'  # search term — change this to query something else
    url_init_first = r'http://image.baidu.com/search/flip?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1497491098685_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&ctd=1497491098685%5E00_1519X735&word='
    url_init = url_init_first + urllib.parse.quote(keyword, safe='/')

    all_pic_urls = []
    # First page, then follow the "next page" links until both the page
    # URL and its picture list come back empty.
    onepage_urls, fanye_url = get_onepage_urls(url_init)
    all_pic_urls.extend(onepage_urls)

    fanye_count = 0  # number of pages followed so far
    while True:
        onepage_urls, fanye_url = get_onepage_urls(fanye_url)
        fanye_count += 1
        if not fanye_url and not onepage_urls:
            break
        all_pic_urls.extend(onepage_urls)

    # Deduplicate before downloading.
    down_pic(list(set(all_pic_urls)))

链接:https://blog.csdn.net/xiligey1/article/details/73321152

2、根据搜索词下载谷歌、必应、百度图片

# coding:utf-8

# 基于icrawler第三方库同时爬取google,baidu,bing图片,并对名称进行重写,数据进行分类

# 图片存放路径为:base_dir='F:/文档/text'

import logging

import sys

import base64

from datetime import date

from icrawler.builtin import BaiduImageCrawler, BingImageCrawler, GoogleImageCrawler

from icrawler import ImageDownloader

from icrawler.builtin import GoogleImageCrawler

from six.moves.urllib.parse import urlparse

class PrefixNameDownloader(ImageDownloader):
    """Downloader that keeps icrawler's default filename, tagged with 'prefix_'."""

    def get_filename(self, task, default_ext):
        base = super(PrefixNameDownloader, self).get_filename(task, default_ext)
        return 'prefix_' + base

class Base64NameDownloader(ImageDownloader):
    """Downloader naming each file as base64(URL path) + extension.

    The extension comes from the URL path when it is a recognised image
    type; otherwise ``default_ext`` is used.
    """

    def get_filename(self, task, default_ext):
        url_path = urlparse(task['file_url'])[2]
        extension = default_ext
        if '.' in url_path:
            candidate = url_path.split('.')[-1]
            if candidate.lower() in (
                    'jpg', 'jpeg', 'png', 'bmp', 'tiff', 'gif', 'ppm', 'pgm'):
                extension = candidate
        encoded = base64.b64encode(url_path.encode()).decode()
        return '{}.{}'.format(encoded, extension)

def test_google(dir, keyword):
    """Crawl up to 1000 Google Images results for ``keyword`` into ``dir``."""
    print('启用google爬虫')
    crawler = GoogleImageCrawler(
        parser_threads=20,
        downloader_threads=20,
        downloader_cls=Base64NameDownloader,
        storage={'root_dir': dir},
        log_level=logging.INFO,
    )
    crawler.crawl(keyword=keyword, offset=0, max_num=1000,
                  min_size=(200, 200), max_size=None)

def test_bing(dir, keyword):
    """Crawl up to 1000 Bing image results for ``keyword`` into ``dir``."""
    # Strip a site-restriction suffix some callers append to the keyword.
    keyword = keyword.replace(': flickr.com', '')
    print('启用bing爬虫', keyword)
    crawler = BingImageCrawler(
        # parser_threads=16,
        downloader_cls=Base64NameDownloader,
        downloader_threads=16,
        storage={'root_dir': dir},
        log_level=logging.DEBUG,
    )
    crawler.crawl(keyword=keyword, offset=0, max_num=1000,
                  min_size=None, max_size=None)

def test_baidu(dir, keyword):
    """Crawl up to 1000 Baidu image results for ``keyword`` into ``dir``."""
    # Strip a site-restriction suffix some callers append to the keyword.
    keyword = keyword.replace(': flickr.com', '')
    print('启用百度爬虫', keyword)
    crawler = BaiduImageCrawler(
        # parser_threads=16,
        # downloader_threads=16,
        downloader_cls=Base64NameDownloader,
        storage={'root_dir': dir},
        log_level=logging.DEBUG,
    )
    crawler.crawl(keyword=keyword, offset=0, max_num=1000,
                  min_size=None, max_size=None)

def main():
    """Run the crawlers selected on the command line (default: all three)."""
    ##################################################################
    keyword = '火车票'
    base_dir = 'F:/文档/text'
    # No CLI args -> 'all'; otherwise the args name the engines to run.
    dst = 'all' if len(sys.argv) == 1 else sys.argv[1:]
    if 'all' in dst:
        dst = ['google', 'bing', 'baidu']
    if 'google' in dst:
        test_google(base_dir, keyword)
    if 'bing' in dst:
        test_bing(base_dir, keyword)
    if 'baidu' in dst:
        test_baidu(base_dir, keyword)


if __name__ == '__main__':
    main()

链接:https://github.com/Crawler-y/Image_crawl-

3、github 搜索爬虫,有许多有趣的项目。

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值