From e6c84bfb37a38c0ec6ddc20f478a623de01d9ea8 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 19 Oct 2021 14:25:48 +0000 Subject: [PATCH 01/95] add pipeline. test=develop --- paddle/fluid/operators/data/CMakeLists.txt | 10 ++ paddle/fluid/operators/data/dataloader_op.cc | 69 +++++++++ paddle/fluid/operators/data/dataloader_op.h | 40 +++++ paddle/fluid/operators/data/pipeline.cc | 143 ++++++++++++++++++ paddle/fluid/operators/data/pipeline.h | 124 +++++++++++++++ .../operators/data/unity_build_rule.cmake | 9 ++ python/paddle/fluid/dataloader/__init__.py | 6 +- python/paddle/fluid/dataloader/pipeline.py | 106 +++++++++++++ python/paddle/fluid/reader.py | 9 +- python/paddle/io/__init__.py | 2 + 10 files changed, 516 insertions(+), 2 deletions(-) create mode 100644 paddle/fluid/operators/data/CMakeLists.txt create mode 100644 paddle/fluid/operators/data/dataloader_op.cc create mode 100644 paddle/fluid/operators/data/dataloader_op.h create mode 100644 paddle/fluid/operators/data/pipeline.cc create mode 100644 paddle/fluid/operators/data/pipeline.h create mode 100644 paddle/fluid/operators/data/unity_build_rule.cmake create mode 100755 python/paddle/fluid/dataloader/pipeline.py diff --git a/paddle/fluid/operators/data/CMakeLists.txt b/paddle/fluid/operators/data/CMakeLists.txt new file mode 100644 index 00000000000000..350f1fd3d94b5b --- /dev/null +++ b/paddle/fluid/operators/data/CMakeLists.txt @@ -0,0 +1,10 @@ +include(operators) +if(WITH_UNITY_BUILD) + # Load Unity Build rules for operators in paddle/fluid/operators/data_ops. + include(unity_build_rule.cmake) +endif() + +register_operators() + +# TODO: add test here +# cc_test(xxx SRCS xxx DEPS xxx) diff --git a/paddle/fluid/operators/data/dataloader_op.cc b/paddle/fluid/operators/data/dataloader_op.cc new file mode 100644 index 00000000000000..8eba9b6055b0a1 --- /dev/null +++ b/paddle/fluid/operators/data/dataloader_op.cc @@ -0,0 +1,69 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. 
+ Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/fluid/operators/data/dataloader_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/imperative/type_defs.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class DataLoaderOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "DataLoaderOp") + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(framework::proto::VarType::FP32, + platform::CPUPlace()); + } +}; + +class DataLoaderOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddOutput("Out", + "(vector)" + "The output tensors of DataLoader operator, also the fetch " + "targets of the loaded program.") + .AsDuplicable(); + AddAttr("global_block", + "(BlockDesc *)" + "The global block of executed dataloader program " + "desc."); + AddAttr("start_op_index", + "(int64_t)" + "The index of the op to start execution"); + AddAttr("end_op_index", + "(int64_t)" + "The index of the op to stop execution"); + AddAttr( + "program_id", + "(int64_t)" + "The unique hash id used as cache key for ExecutorInfoCache."); + AddComment(R"DOC( + DataLoader Op + )DOC"); + } +}; + +} // namespace operators +} // 
namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(dataloader, ops::DataLoaderOp, ops::DataLoaderOpMaker); +REGISTER_OP_CPU_KERNEL(dataloader, ops::DataLoaderKernel); diff --git a/paddle/fluid/operators/data/dataloader_op.h b/paddle/fluid/operators/data/dataloader_op.h new file mode 100644 index 00000000000000..6025642424225b --- /dev/null +++ b/paddle/fluid/operators/data/dataloader_op.h @@ -0,0 +1,40 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/reader/pipeline.h" + +namespace paddle { +namespace operators { + +class DataLoaderKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + // Step1: get output vars and attrs + auto output_vars = ctx.MultiOutputVar("Out"); + auto output_var_names = ctx.OutputNames("Out"); + + auto* global_block = ctx.Attr("global_block"); + auto start_op_index = ctx.Attr("start_op_index"); + auto end_op_index = ctx.Attr("end_op_index"); + auto program_id = ctx.Attr("program_id"); + + auto* pipeline = PipelineManager::Instance().GetPipeline( + program_id, *global_block, ctx.GetPlace(), + start_op_index, end_op_index, output_var_names); + + pipeline->ReadNext(output_vars); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/data/pipeline.cc b/paddle/fluid/operators/data/pipeline.cc new file mode 100644 index 00000000000000..423946cbcdd295 --- /dev/null +++ b/paddle/fluid/operators/data/pipeline.cc @@ -0,0 +1,143 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#pragma once +#include "ThreadPool.h" + +#include "paddle/fluid/operators/data/pipeline.h" + +namespace paddle { +namespace operators { +namespace data { + +Pipeline::Pipeline( + const BlockDesc &global_block, const platform::Place &place, + int64_t start_op_index, int64_t end_op_index, int64_t program_id, + const std::vector &output_var_names, + size_t prefetch_queue_size = 2) + : thread_pool_(1), + global_block_(global_block), + place_(place), + start_op_index_(start_op_index), + end_op_index_(end_op_index), + program_id_(program_id). + output_var_names_(output_var_names), + prefetch_queue_size_(prefetch_queue_size), + prefetch_queue_(prefetch_queue_size) { + VLOG(1) << "Pipeline init"; + + PADDLE_ENFORCE_GT(end_op_index_ > start_op_index_, + platform::errors::InvalidArgument( + "end_op_index should be greater than start_op_index, " + "but recieve %d <= %d.", end_op_index_, start_op_index_)) + + // Step1: prepare executor + auto *program = global_block_->Program(); + auto cache_info = framework::GetExecutorInfoFromCache( + *program, place_, start_op_index_, end_op_index_, + /*is_grad=*/false, program_id, &scope_); + auto ¶llel_executor = cache_info.first; + + // Step2: parset persistable variables + auto &skip_eager_delete_vars = + framework::ExecutorInfoCache::Instance().SkipEagerDeleteVars( + program_id, /*is_grad=*/false); + if (cache_info.second /*is_new_created*/) { + // DataLoader program do not has input variables, not need to + // skip memory reuse for input variables here + skip_eager_delete_vars.insert(skip_eager_delete_vars.end(), + output_var_names.begin(), + output_var_names.end()); + framework::details::ParseSafeEagerDeletionSkipVars( + *program, end_op_index, output_var_names, &skip_eager_delete_vars); + } + + // Step3: start prefetch thread + StartPrefetchThread(parallel_executor, skip_eager_delete_vars) +} + +Pipeline::~Pipeline() { + VLOG(1) << "~Pipeline"; + +} + +Pipeline::StartPrefetchThreads(const ParallelExecutor &executor, + const 
std::vector &skip_vars) { + thread_pool_.enqueue([this, executor, skip_vars] -> void { + while (!closed_) { + // Step1: run ops by executor without fetch + executor->RunWithoutFetch(skip_eager_delete_vars); + + // Step2: fetch output variable to LoDTensor vector + framework::LoDTensorArray t_arr; + t_arr.resize(output_var_names_.size()); + for (size_t i = 0; i < output_var_names.size(); i++) { + auto *out_var = scope_.FindVar(output_var_names[i]); + PADDLE_ENFORCE_NOT_NULL( + out_var, platform::errors::NotFound( + "The output variable %s is not found in DataLoader " + "program's internal scope", output_var_names[i])); + CheckOutputVarStatus(*out_var, output_var_names[i]); + copy_tensor(*out_var, &t_arr[i]); + } + + // Step3: put LoDTensorArray to prefetch blocking_queue + prefetch_queue_.Push(t_arr); + } + }); +} + +void Pipeline::CheckOutputVarStatus( + const Variable &var, const std::string &var_name) { + // only LoDTensor variable type support currently + PADDLE_ENFORCE_EQ( + var.IsType(), true, + platform::errors::InvalidArgument( + "The output variable %s get from DataLoader program's " + "internal scope holds wrong type. 
Expect type is " + "LoDTensor, but receive type is %s.", var_name, + platform::demangle(framework::ToTypeName(var.Type())))); + PADDLE_ENFORCE_EQ( + var.Get().IsInitialized(), true, + platform::errors::InvalidArgument( + "The tensor in output variable %s get from DataLoader " + "program's internal scope is not initialized.", var_name)); +} + +void Pipeline::ReadNext(std::vector &out_vars) { + bool ok = true; + auto vars = prefetch_queue_.Pop(&ok); + PADDLE_ENFORCE_EQ(ok, true, platform::errors:Unavailable(" + Pop prefetch queue failed.")); + PADDLE_ENFORCE_EQ(out_vars.size(), vars.size(), + platform::errors::InvalidArgument( + "Output variable number to read should be variable number " + "read from prefetch queue, but recieved %d != %d", + out_vars.size(), output_var_names.size())); + + + for (size_t i = 0; i < vars.size(); i++) { + out_vars[i] = &vars[i]; + } +} + +bool Pipeline::Reset() { + // (TODO)Step1: reset dataset + // + // Step2: reopen pipeline + prefetch_queue_->Reopen(); + closed_ = false; +} + + +} // data +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/data/pipeline.h b/paddle/fluid/operators/data/pipeline.h new file mode 100644 index 00000000000000..33269a05b40479 --- /dev/null +++ b/paddle/fluid/operators/data/pipeline.h @@ -0,0 +1,124 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#pragma once +#include +#include +#include "ThreadPool.h" + +#include "paddle/fluid/operators/data/pipeline.h" + +namespace paddle { +namespace operators { + +using BlockDesc = framework::BlockDesc; +using Scope = framework::Scope; + +using LoDTensor = framework::LoDTensor; +using LoDTensorBlockingQueue = paddle::operators::reader::LoDTensorBlockingQueue + +namespace data { + +class Pipeline { + public: + Pipeline(const BlockDesc &global_block, int64_t start_op_index, + int64_t end_op_index, int64_t program_id, + const std::vector &output_var_names, + size_t prefetch_queue_size = 2); + + private: + inline size_t PrefetchCap() { return prefetch_queue_.Cap(); } + + inline size_t PrefetchSize() { return prefetch_queue_.Size(); } + + inline void Pipeline::Close() { + VLOD(1) << "Pipeline close"; + prefetch_queue_.Close(); + closed_ = true; + } + + inline bool IsClosed() { return closed_; } + + bool Reset(); + + void copy_tensor(const framework::LoDTensor &lod_tensor, + framework::LoDTensor *out) const { + if (lod_tensor.numel() == 0) return; + auto &out_tensor = *out; + TensorCopy(lod_tensor, lod_tensor.place(), &out_tensor); + out_tensor.set_lod(lod_tensor.lod()); + } + + void CheckOutputVarStatus(const Variable &var, const std::string &var_name); + + void ReadNext(std::vector &out_vars); + + std::shared_ptr global_block_; + Scope scope_; + int64_t start_op_index_; + int64_t end_op_index_; + int64_t program_id_; + + std::vector output_var_names_; + + platform::Place place_; + + ThreadPool thread_pool_; + const size_t prefetch_queue_size_; + const std::shared_ptr prefetch_queue_; + bool closed_{false}; + +}; + +class PipelineManager { + // PipelineManager is a signleton manager for Pipeline, we + // create single Pipeline for a program id + private: + DISABLE_COPY_AND_ASSIGN(PipelineManager); + + static PipelineManager* pm_instance_ptr_{nullptr}; + std::map prog_id_to_pipeline_; + + public: + static PipelineManager& Instance() { + if (pm_instance_ptr_ == nullptr) 
{ + pm_instance_ptr_ = new PipelineManager(); + } + return *pm_instance_ptr_; + } + + std::shared_ptr GetPipeline( + int64_t program_id, const BlockDesc &global_block, + const platform::Place &place, int64_t start_op_index, + int64_t end_op_index, + const std::vector &output_var_names, + size_t prefetch_queue_size = 2) { + auto iter = prog_id_to_pipeline_.find(program_id); + if (iter != prog_id_to_pipeline_.end()) { + auto* pipeline = new Pipeline(global_block, place, + start_op_index, + end_op_index, + program_id, + output_var_names, + prefetch_queue_size); + prog_id_to_pipeline_.insert(std::pair(program_id, *pipeline)); + return std::make_shared(pipeline); + } else { + reutrn std::make_shared(&iter.second); + } + + } +}; + + +} // data +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/data/unity_build_rule.cmake b/paddle/fluid/operators/data/unity_build_rule.cmake new file mode 100644 index 00000000000000..8b4371facbfe6f --- /dev/null +++ b/paddle/fluid/operators/data/unity_build_rule.cmake @@ -0,0 +1,9 @@ +# This file records the Unity Build compilation rules. +# The source files in a `register_unity_group` called are compiled in a unity +# file. +# Generally, the combination rules in this file do not need to be modified. +# If there are some redefined error in compiling with the source file which +# in combination rule, you can remove the source file from the following rules. +register_unity_group(cc + dataloader_op.cc + pipeline.cc) diff --git a/python/paddle/fluid/dataloader/__init__.py b/python/paddle/fluid/dataloader/__init__.py index 597f1f217483cc..9ecfff2f7dadc9 100644 --- a/python/paddle/fluid/dataloader/__init__.py +++ b/python/paddle/fluid/dataloader/__init__.py @@ -26,7 +26,11 @@ from . import sampler from .sampler import * +from . 
import pipeline +from .pipeline import * + __all__ = dataset.__all__ \ + batch_sampler.__all__ \ + dataloader_iter.__all__ \ - + sampler.__all__ + + sampler.__all__ \ + + pipeline.__all__ diff --git a/python/paddle/fluid/dataloader/pipeline.py b/python/paddle/fluid/dataloader/pipeline.py new file mode 100755 index 00000000000000..9a18bb5427c5c8 --- /dev/null +++ b/python/paddle/fluid/dataloader/pipeline.py @@ -0,0 +1,106 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import paddle +import paddle.fluid as fluid + +from paddle import _C_ops +from paddle.fluid import core, framework +from paddle.fluid.layers.utils import _hash_with_id + +from collections.abc import Sequence, Mapping + +__all__ = ["Pipeline"] + + +class Pipeline: + """ + Data pipeline + + Args: + queue_depth(int): queue depth for caching data between OPs + """ + + def __init__(self, queue_depth=2): + assert isinstance(queue_depth, int), \ + "queue_depth should be an integer" + self._queue_depth = queue_depth + + def _init_programs(self): + self._main_program = fluid.Program() + self._startup_program = fluid.Program() + self._run_program = fluid.Program() + self._out_vars = [] + self._out_names = [] + self._is_built = False + + def __enter__(self): + # switch main and startup program + self._main_program = fluid.switch_main_program(self._main_program) + self._startup_program = fluid.switch_startup_program(self._startup_program) + return self + + def __exit__(self): + self._main_program = fluid.switch_main_program(self._main_program) + self._startup_program = fluid.switch_startup_program(self._startup_program) + + def set_outputs(self, outputs): + if isinstance(outputs, Sequence): + for var in outputs: + self._out_vars.append(output) + elif isinstance(outputs, Mapping): + for name, var in outputs.items(): + self._out_vars.append(var) + self._out_names.append(name) + else: + assert isinstance(outputs, fluid.Variable), \ + "outputs should be list, dict or Variable" + + def build(self): + self._output_vars = self._prepare_output_vars() + global_block = self._main_program.desc.block(0) + program_id = _hash_with_id(self._main_program, self) + + self._attrs = ('global_block', global_block, 'start_op_index', 0, + 'end_op_index', global_block.op_size(), + 'program_id', program_id) + + self._is_built = True + + def _prepare_output_vars(self): + output_vars = [] + for var in self._out_vars: + assert isinstance(var, 
framework.Variable), \ + "output of DataLoader program should be Variable" + var_desc = var.desc + output_var = core.VarBase(var_desc.dtype(), + var_desc.shape(), + var_desc.name(), + var_desc.type(), False) + output_vars.append(output_var) + + return output_vars + + def __next__(self): + assert self._is_built, \ + "Pipeline not built, please call build() firstly" + _C_ops.dataloader(self._output_vars, *self._attrs) + return {k: v for k, v in zip(self._output_vars, self._out_names)} + + # Python 2 compatable + def next(self): + return self.__next__() + diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py index dfc887292e7cff..bdbf75acfff625 100644 --- a/python/paddle/fluid/reader.py +++ b/python/paddle/fluid/reader.py @@ -22,7 +22,7 @@ from .executor import global_scope from .data_feeder import DataFeeder, BatchedTensorProvider from .multiprocess_utils import multiprocess_queue_set, CleanupFuncRegistrar, _cleanup_mmap, _cleanup, _set_SIGCHLD_handler -from .dataloader import BatchSampler, Dataset, IterableDataset +from .dataloader import BatchSampler, Dataset, IterableDataset, Pipeline from .dataloader.dataloader_iter import _DataLoaderIterSingleProcess, _DataLoaderIterMultiProcess, _DatasetKind, default_collate_fn from .dataloader.batch_sampler import _InfiniteIterableSampler from .layers.io import monkey_patch_reader_methods, _copy_reader_var_, double_buffer @@ -436,6 +436,13 @@ def __iter__(self): def __call__(self): return self.__iter__() + @staticmethod + def from_pipeline(pipeline): + assert isinstance(pipeline, Pipeline), \ + "pipeline should be an instance of paddle.io.Pipeline" + pipeline.build() + return pipeline + @staticmethod def from_generator(feed_list=None, capacity=None, diff --git a/python/paddle/io/__init__.py b/python/paddle/io/__init__.py index 5781f78c6e4e4a..b267b09925d4c0 100755 --- a/python/paddle/io/__init__.py +++ b/python/paddle/io/__init__.py @@ -15,6 +15,7 @@ # TODO: define all functions about input & output in 
this directory from ..fluid.io import DataLoader # noqa: F401 +from ..fluid.dataloader import Pipeline # noqa: F401 from ..fluid.dataloader import Dataset # noqa: F401 from ..fluid.dataloader import IterableDataset # noqa: F401 from ..fluid.dataloader import BatchSampler # noqa: F401 @@ -40,6 +41,7 @@ 'DistributedBatchSampler', 'DataLoader', 'get_worker_info', + 'Pipeline', 'Sampler', 'SequenceSampler', 'RandomSampler', From 8f19ea29850793159632354ba5c78d5611c3ba1f Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 19 Oct 2021 14:55:00 +0000 Subject: [PATCH 02/95] fixing compile error. --- paddle/fluid/operators/CMakeLists.txt | 1 + paddle/fluid/operators/data/dataloader_op.cc | 5 +- paddle/fluid/operators/data/dataloader_op.h | 10 ++- paddle/fluid/operators/data/pipeline.cc | 19 +++++- paddle/fluid/operators/data/pipeline.h | 64 +++++++++++--------- python/paddle/fluid/dataloader/pipeline.py | 1 - 6 files changed, 63 insertions(+), 37 deletions(-) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index b910b4ec73901b..41cbc3346dc38f 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -18,6 +18,7 @@ add_subdirectory(optimizers) add_subdirectory(reduce_ops) add_subdirectory(sequence_ops) add_subdirectory(jit) +add_subdirectory(data) if(WITH_MKLDNN) add_subdirectory(mkldnn) endif() diff --git a/paddle/fluid/operators/data/dataloader_op.cc b/paddle/fluid/operators/data/dataloader_op.cc index 8eba9b6055b0a1..b7af0b41d3a895 100644 --- a/paddle/fluid/operators/data/dataloader_op.cc +++ b/paddle/fluid/operators/data/dataloader_op.cc @@ -21,8 +21,9 @@ using framework::Tensor; class DataLoaderOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "DataLoaderOp") + OP_INOUT_CHECK(ctx->HasOutput("Out"), 
"Output", "Out", "DataLoaderOp"); } protected: @@ -66,4 +67,4 @@ class DataLoaderOpMaker : public framework::OpProtoAndCheckerMaker { namespace ops = paddle::operators; REGISTER_OPERATOR(dataloader, ops::DataLoaderOp, ops::DataLoaderOpMaker); -REGISTER_OP_CPU_KERNEL(dataloader, ops::DataLoaderKernel); +REGISTER_OP_CPU_KERNEL(dataloader, ops::DataLoaderOpKernel); diff --git a/paddle/fluid/operators/data/dataloader_op.h b/paddle/fluid/operators/data/dataloader_op.h index 6025642424225b..122e750c25ba08 100644 --- a/paddle/fluid/operators/data/dataloader_op.h +++ b/paddle/fluid/operators/data/dataloader_op.h @@ -11,12 +11,16 @@ #pragma once #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/reader/pipeline.h" +#include "paddle/fluid/operators/data/pipeline.h" namespace paddle { namespace operators { -class DataLoaderKernel : public framework::OpKernel { +using Pipeline= data::Pipeline; +using PipelineManager = data::PipelineManager; + +template +class DataLoaderOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { // Step1: get output vars and attrs @@ -28,7 +32,7 @@ class DataLoaderKernel : public framework::OpKernel { auto end_op_index = ctx.Attr("end_op_index"); auto program_id = ctx.Attr("program_id"); - auto* pipeline = PipelineManager::Instance().GetPipeline( + auto pipeline = PipelineManager::Instance().GetPipeline( program_id, *global_block, ctx.GetPlace(), start_op_index, end_op_index, output_var_names); diff --git a/paddle/fluid/operators/data/pipeline.cc b/paddle/fluid/operators/data/pipeline.cc index 423946cbcdd295..e91c829497488d 100644 --- a/paddle/fluid/operators/data/pipeline.cc +++ b/paddle/fluid/operators/data/pipeline.cc @@ -24,6 +24,7 @@ Pipeline::Pipeline( const std::vector &output_var_names, size_t prefetch_queue_size = 2) : thread_pool_(1), + closed_(false), global_block_(global_block), place_(place), start_op_index_(start_op_index), @@ -69,7 +70,7 @@ 
Pipeline::~Pipeline() { } -Pipeline::StartPrefetchThreads(const ParallelExecutor &executor, +void Pipeline::StartPrefetchThread(const ParallelExecutor &executor, const std::vector &skip_vars) { thread_pool_.enqueue([this, executor, skip_vars] -> void { while (!closed_) { @@ -89,6 +90,11 @@ Pipeline::StartPrefetchThreads(const ParallelExecutor &executor, copy_tensor(*out_var, &t_arr[i]); } + // TODO: dataset drain check + // if dataset drained: + // closed_.store(true) + // break + // Step3: put LoDTensorArray to prefetch blocking_queue prefetch_queue_.Push(t_arr); } @@ -129,12 +135,19 @@ void Pipeline::ReadNext(std::vector &out_vars) { } } -bool Pipeline::Reset() { +inline void Pipeline::Close() { + VLOD(1) << "Pipeline close"; + prefetch_queue_.Close(); + closed_ = true; +} + +inline void Pipeline::Reset() { // (TODO)Step1: reset dataset // // Step2: reopen pipeline prefetch_queue_->Reopen(); - closed_ = false; + closed_.store(false); + StartPrefetchThread(); } diff --git a/paddle/fluid/operators/data/pipeline.h b/paddle/fluid/operators/data/pipeline.h index 33269a05b40479..eaf72e6d6d15a6 100644 --- a/paddle/fluid/operators/data/pipeline.h +++ b/paddle/fluid/operators/data/pipeline.h @@ -14,41 +14,42 @@ #include #include "ThreadPool.h" -#include "paddle/fluid/operators/data/pipeline.h" +#include "paddle/fluid/framework/parallel_executor.h" +#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" namespace paddle { namespace operators { using BlockDesc = framework::BlockDesc; using Scope = framework::Scope; +using ParallelExecutor = framework::ParallelExecutor; +using Variable = framework::Variable; using LoDTensor = framework::LoDTensor; -using LoDTensorBlockingQueue = paddle::operators::reader::LoDTensorBlockingQueue +using LoDTensorBlockingQueue = operators::reader::LoDTensorBlockingQueue; namespace data { class Pipeline { public: - Pipeline(const BlockDesc &global_block, int64_t start_op_index, - int64_t end_op_index, int64_t program_id, + 
Pipeline(const BlockDesc &global_block, const platform::Place &place, + int64_t start_op_index, int64_t end_op_index, int64_t program_id, const std::vector &output_var_names, size_t prefetch_queue_size = 2); - private: inline size_t PrefetchCap() { return prefetch_queue_.Cap(); } inline size_t PrefetchSize() { return prefetch_queue_.Size(); } - inline void Pipeline::Close() { - VLOD(1) << "Pipeline close"; - prefetch_queue_.Close(); - closed_ = true; - } - inline bool IsClosed() { return closed_; } - bool Reset(); + inline void Close(); + + inline void Reset(); + + void ReadNext(std::vector &out_vars); + private: void copy_tensor(const framework::LoDTensor &lod_tensor, framework::LoDTensor *out) const { if (lod_tensor.numel() == 0) return; @@ -57,9 +58,9 @@ class Pipeline { out_tensor.set_lod(lod_tensor.lod()); } - void CheckOutputVarStatus(const Variable &var, const std::string &var_name); + void StartPrefetchThread(const ParallelExecutor &executor,const std::vector &skip_vars); - void ReadNext(std::vector &out_vars); + void CheckOutputVarStatus(const Variable &var, const std::string &var_name); std::shared_ptr global_block_; Scope scope_; @@ -73,8 +74,8 @@ class Pipeline { ThreadPool thread_pool_; const size_t prefetch_queue_size_; - const std::shared_ptr prefetch_queue_; - bool closed_{false}; + LoDTensorBlockingQueue prefetch_queue_; + std::atomic closed_; }; @@ -84,13 +85,13 @@ class PipelineManager { private: DISABLE_COPY_AND_ASSIGN(PipelineManager); - static PipelineManager* pm_instance_ptr_{nullptr}; - std::map prog_id_to_pipeline_; + static std::shared_ptr pm_instance_ptr_; + std::map> prog_id_to_pipeline_; public: static PipelineManager& Instance() { if (pm_instance_ptr_ == nullptr) { - pm_instance_ptr_ = new PipelineManager(); + pm_instance_ptr_ = std::shared_ptr(new PipelineManager); } return *pm_instance_ptr_; } @@ -103,18 +104,25 @@ class PipelineManager { size_t prefetch_queue_size = 2) { auto iter = prog_id_to_pipeline_.find(program_id); if (iter != 
prog_id_to_pipeline_.end()) { - auto* pipeline = new Pipeline(global_block, place, - start_op_index, - end_op_index, - program_id, - output_var_names, - prefetch_queue_size); - prog_id_to_pipeline_.insert(std::pair(program_id, *pipeline)); - return std::make_shared(pipeline); + prog_id_to_pipeline_[program_id] = \ + std::shared_ptr(new Pipeline( + global_block, place, + start_op_index, + end_op_index, + program_id, + output_var_names, + prefetch_queue_size)); + return prog_id_to_pipeline_[program_id]; } else { - reutrn std::make_shared(&iter.second); + return iter->second; } + } + + PipelineManager() { VLOG(1) << "PipelineManager init"; } + ~PipelineManager() { + VLOG(1) << "~PipelineManager"; + prog_id_to_pipeline_.clear(); } }; diff --git a/python/paddle/fluid/dataloader/pipeline.py b/python/paddle/fluid/dataloader/pipeline.py index 9a18bb5427c5c8..ec9178ce1f4e51 100755 --- a/python/paddle/fluid/dataloader/pipeline.py +++ b/python/paddle/fluid/dataloader/pipeline.py @@ -42,7 +42,6 @@ def __init__(self, queue_depth=2): def _init_programs(self): self._main_program = fluid.Program() self._startup_program = fluid.Program() - self._run_program = fluid.Program() self._out_vars = [] self._out_names = [] self._is_built = False From 928e2cea30e9f6b0425cf57fe194e3ce9244a880 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Wed, 20 Oct 2021 14:37:42 +0000 Subject: [PATCH 03/95] compile success --- paddle/fluid/operators/data/CMakeLists.txt | 7 ++- paddle/fluid/operators/data/dataloader_op.cc | 12 ++-- paddle/fluid/operators/data/dataloader_op.h | 11 ++-- paddle/fluid/operators/data/pipeline.cc | 60 +++++++++---------- paddle/fluid/operators/data/pipeline.h | 51 +++++++++------- .../operators/data/unity_build_rule.cmake | 4 +- 6 files changed, 79 insertions(+), 66 deletions(-) diff --git a/paddle/fluid/operators/data/CMakeLists.txt b/paddle/fluid/operators/data/CMakeLists.txt index 350f1fd3d94b5b..76700a77b403d9 100644 --- a/paddle/fluid/operators/data/CMakeLists.txt +++ 
b/paddle/fluid/operators/data/CMakeLists.txt @@ -1,10 +1,13 @@ include(operators) if(WITH_UNITY_BUILD) - # Load Unity Build rules for operators in paddle/fluid/operators/data_ops. + # Load Unity Build rules for operators in paddle/fluid/operators/data/ include(unity_build_rule.cmake) endif() -register_operators() +cc_library(pipeline SRCS pipeline.cc DEPS parallel_executor simple_threadpool) +op_library(dataloader_op SRCS dataloader_op.cc DEPS pipeline) + +# register_operators() # TODO: add test here # cc_test(xxx SRCS xxx DEPS xxx) diff --git a/paddle/fluid/operators/data/dataloader_op.cc b/paddle/fluid/operators/data/dataloader_op.cc index b7af0b41d3a895..f199d80f4c7c94 100644 --- a/paddle/fluid/operators/data/dataloader_op.cc +++ b/paddle/fluid/operators/data/dataloader_op.cc @@ -52,10 +52,14 @@ class DataLoaderOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("end_op_index", "(int64_t)" "The index of the op to stop execution"); - AddAttr( - "program_id", - "(int64_t)" - "The unique hash id used as cache key for ExecutorInfoCache."); + AddAttr("program_id", + "(int64_t)" + "The unique hash id used as cache key for " + "ExecutorInfoCache"); + AddAttr("prefetch_depth", + "(size_t)" + "The prefetch batch number") + .SetDefault(2); AddComment(R"DOC( DataLoader Op )DOC"); diff --git a/paddle/fluid/operators/data/dataloader_op.h b/paddle/fluid/operators/data/dataloader_op.h index 122e750c25ba08..8c8b0fa616a916 100644 --- a/paddle/fluid/operators/data/dataloader_op.h +++ b/paddle/fluid/operators/data/dataloader_op.h @@ -16,8 +16,7 @@ namespace paddle { namespace operators { -using Pipeline= data::Pipeline; -using PipelineManager = data::PipelineManager; +class Pipeline; template class DataLoaderOpKernel : public framework::OpKernel { @@ -31,10 +30,12 @@ class DataLoaderOpKernel : public framework::OpKernel { auto start_op_index = ctx.Attr("start_op_index"); auto end_op_index = ctx.Attr("end_op_index"); auto program_id = ctx.Attr("program_id"); + auto 
prefetch_depth = static_cast(ctx.Attr("prefetch_depth")); - auto pipeline = PipelineManager::Instance().GetPipeline( - program_id, *global_block, ctx.GetPlace(), - start_op_index, end_op_index, output_var_names); + auto pipeline = data::PipelineManager::Instance()->GetPipeline( + program_id, global_block, ctx.GetPlace(), + start_op_index, end_op_index, output_var_names, + prefetch_depth); pipeline->ReadNext(output_vars); } diff --git a/paddle/fluid/operators/data/pipeline.cc b/paddle/fluid/operators/data/pipeline.cc index e91c829497488d..cbc619e7164a6e 100644 --- a/paddle/fluid/operators/data/pipeline.cc +++ b/paddle/fluid/operators/data/pipeline.cc @@ -9,37 +9,35 @@ See the License for the specific language governing permissions and limitations under the License. */ -#pragma once -#include "ThreadPool.h" - #include "paddle/fluid/operators/data/pipeline.h" +#include "paddle/fluid/framework/executor_cache.h" namespace paddle { namespace operators { namespace data { Pipeline::Pipeline( - const BlockDesc &global_block, const platform::Place &place, + const std::shared_ptr global_block, const platform::Place &place, int64_t start_op_index, int64_t end_op_index, int64_t program_id, const std::vector &output_var_names, - size_t prefetch_queue_size = 2) + size_t prefetch_queue_size) : thread_pool_(1), closed_(false), global_block_(global_block), place_(place), start_op_index_(start_op_index), end_op_index_(end_op_index), - program_id_(program_id). 
+ program_id_(program_id), output_var_names_(output_var_names), prefetch_queue_size_(prefetch_queue_size), prefetch_queue_(prefetch_queue_size) { VLOG(1) << "Pipeline init"; - PADDLE_ENFORCE_GT(end_op_index_ > start_op_index_, + PADDLE_ENFORCE_GT(end_op_index_, start_op_index_, platform::errors::InvalidArgument( "end_op_index should be greater than start_op_index, " - "but recieve %d <= %d.", end_op_index_, start_op_index_)) - + "but recieve %d <= %d.", end_op_index_, start_op_index_)); + // Step1: prepare executor auto *program = global_block_->Program(); auto cache_info = framework::GetExecutorInfoFromCache( @@ -62,32 +60,28 @@ Pipeline::Pipeline( } // Step3: start prefetch thread - StartPrefetchThread(parallel_executor, skip_eager_delete_vars) -} - -Pipeline::~Pipeline() { - VLOG(1) << "~Pipeline"; - + StartPrefetchThread(parallel_executor, skip_eager_delete_vars); } -void Pipeline::StartPrefetchThread(const ParallelExecutor &executor, +void Pipeline::StartPrefetchThread( + std::shared_ptr executor, const std::vector &skip_vars) { - thread_pool_.enqueue([this, executor, skip_vars] -> void { - while (!closed_) { + thread_pool_.enqueue([this, executor, skip_vars]() -> void { + while (!closed_.load()) { // Step1: run ops by executor without fetch - executor->RunWithoutFetch(skip_eager_delete_vars); + executor->RunWithoutFetch(skip_vars); // Step2: fetch output variable to LoDTensor vector framework::LoDTensorArray t_arr; t_arr.resize(output_var_names_.size()); - for (size_t i = 0; i < output_var_names.size(); i++) { - auto *out_var = scope_.FindVar(output_var_names[i]); + for (size_t i = 0; i < output_var_names_.size(); i++) { + auto *out_var = scope_.FindVar(output_var_names_[i]); PADDLE_ENFORCE_NOT_NULL( out_var, platform::errors::NotFound( "The output variable %s is not found in DataLoader " - "program's internal scope", output_var_names[i])); - CheckOutputVarStatus(*out_var, output_var_names[i]); - copy_tensor(*out_var, &t_arr[i]); + "program's internal scope", 
output_var_names_[i])); + CheckOutputVarStatus(*out_var, output_var_names_[i]); + copy_tensor(out_var->Get(), &t_arr[i]); } // TODO: dataset drain check @@ -121,22 +115,23 @@ void Pipeline::CheckOutputVarStatus( void Pipeline::ReadNext(std::vector &out_vars) { bool ok = true; auto vars = prefetch_queue_.Pop(&ok); - PADDLE_ENFORCE_EQ(ok, true, platform::errors:Unavailable(" - Pop prefetch queue failed.")); + PADDLE_ENFORCE_EQ(ok, true, platform::errors::Unavailable( + "Pop prefetch queue failed.")); PADDLE_ENFORCE_EQ(out_vars.size(), vars.size(), platform::errors::InvalidArgument( "Output variable number to read should be variable number " "read from prefetch queue, but recieved %d != %d", - out_vars.size(), output_var_names.size())); + out_vars.size(), output_var_names_.size())); for (size_t i = 0; i < vars.size(); i++) { - out_vars[i] = &vars[i]; + // out_vars[i] = &vars[i]; + copy_tensor(vars[i], out_vars[i]->GetMutable()); } } inline void Pipeline::Close() { - VLOD(1) << "Pipeline close"; + VLOG(1) << "Pipeline close"; prefetch_queue_.Close(); closed_ = true; } @@ -145,11 +140,14 @@ inline void Pipeline::Reset() { // (TODO)Step1: reset dataset // // Step2: reopen pipeline - prefetch_queue_->Reopen(); + prefetch_queue_.ReOpen(); closed_.store(false); - StartPrefetchThread(); + // StartPrefetchThread(); } +// initialization static variables out of PipelineManager +PipelineManager* PipelineManager::pm_instance_ptr_ = nullptr; +std::mutex PipelineManager::m_; } // data } // namespace operators diff --git a/paddle/fluid/operators/data/pipeline.h b/paddle/fluid/operators/data/pipeline.h index eaf72e6d6d15a6..9406af8ef50608 100644 --- a/paddle/fluid/operators/data/pipeline.h +++ b/paddle/fluid/operators/data/pipeline.h @@ -11,6 +11,8 @@ #pragma once #include +#include +#include #include #include "ThreadPool.h" @@ -32,10 +34,15 @@ namespace data { class Pipeline { public: - Pipeline(const BlockDesc &global_block, const platform::Place &place, + Pipeline(const 
std::shared_ptr global_block, const platform::Place &place, int64_t start_op_index, int64_t end_op_index, int64_t program_id, const std::vector &output_var_names, - size_t prefetch_queue_size = 2); + size_t prefetch_queue_size); + + // ~Pipeline() { + // VLOG(1) << "~Pipeline"; + // Close(); + // } inline size_t PrefetchCap() { return prefetch_queue_.Cap(); } @@ -58,25 +65,25 @@ class Pipeline { out_tensor.set_lod(lod_tensor.lod()); } - void StartPrefetchThread(const ParallelExecutor &executor,const std::vector &skip_vars); + void StartPrefetchThread(std::shared_ptr executor, + const std::vector &skip_vars); void CheckOutputVarStatus(const Variable &var, const std::string &var_name); - std::shared_ptr global_block_; + ThreadPool thread_pool_; + std::atomic closed_; + Scope scope_; + std::shared_ptr global_block_; + platform::Place place_; int64_t start_op_index_; int64_t end_op_index_; int64_t program_id_; std::vector output_var_names_; - platform::Place place_; - - ThreadPool thread_pool_; const size_t prefetch_queue_size_; LoDTensorBlockingQueue prefetch_queue_; - std::atomic closed_; - }; class PipelineManager { @@ -85,33 +92,34 @@ class PipelineManager { private: DISABLE_COPY_AND_ASSIGN(PipelineManager); - static std::shared_ptr pm_instance_ptr_; + static PipelineManager* pm_instance_ptr_; std::map> prog_id_to_pipeline_; + static std::mutex m_; public: - static PipelineManager& Instance() { + static PipelineManager* Instance() { if (pm_instance_ptr_ == nullptr) { - pm_instance_ptr_ = std::shared_ptr(new PipelineManager); + std::lock_guard lk(m_); + if (pm_instance_ptr_ == nullptr) { + pm_instance_ptr_ = new PipelineManager; + } } - return *pm_instance_ptr_; + return pm_instance_ptr_; } std::shared_ptr GetPipeline( - int64_t program_id, const BlockDesc &global_block, + int64_t program_id, BlockDesc* global_block, const platform::Place &place, int64_t start_op_index, int64_t end_op_index, const std::vector &output_var_names, - size_t prefetch_queue_size = 2) { + 
size_t prefetch_queue_size) { auto iter = prog_id_to_pipeline_.find(program_id); if (iter != prog_id_to_pipeline_.end()) { prog_id_to_pipeline_[program_id] = \ std::shared_ptr(new Pipeline( - global_block, place, - start_op_index, - end_op_index, - program_id, - output_var_names, - prefetch_queue_size)); + std::shared_ptr(global_block), place, + start_op_index, end_op_index, program_id, + output_var_names, prefetch_queue_size)); return prog_id_to_pipeline_[program_id]; } else { return iter->second; @@ -126,7 +134,6 @@ class PipelineManager { } }; - } // data } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/data/unity_build_rule.cmake b/paddle/fluid/operators/data/unity_build_rule.cmake index 8b4371facbfe6f..b7bab025785268 100644 --- a/paddle/fluid/operators/data/unity_build_rule.cmake +++ b/paddle/fluid/operators/data/unity_build_rule.cmake @@ -5,5 +5,5 @@ # If there are some redefined error in compiling with the source file which # in combination rule, you can remove the source file from the following rules. 
register_unity_group(cc - dataloader_op.cc - pipeline.cc) + pipeline.cc + dataloader_op.cc) From 7cdecdd24d114e1a7b568ff591e0f9c770007900 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Sun, 24 Oct 2021 13:29:58 +0000 Subject: [PATCH 04/95] run success --- paddle/fluid/operators/data/CMakeLists.txt | 2 +- paddle/fluid/operators/data/dataloader_op.cc | 16 +- .../fluid/operators/data/dataloader_op.cu.cc | 20 +++ paddle/fluid/operators/data/dataloader_op.h | 38 ++--- paddle/fluid/operators/data/pipeline.cc | 90 +++++------ paddle/fluid/operators/data/pipeline.h | 142 +++++++++--------- .../operators/data/unity_build_rule.cmake | 3 + paddle/fluid/pybind/op_function_generator.cc | 1 + python/paddle/fluid/dataloader/pipeline.py | 36 +++-- 9 files changed, 194 insertions(+), 154 deletions(-) create mode 100644 paddle/fluid/operators/data/dataloader_op.cu.cc diff --git a/paddle/fluid/operators/data/CMakeLists.txt b/paddle/fluid/operators/data/CMakeLists.txt index 76700a77b403d9..cc6ee7c8e9b63f 100644 --- a/paddle/fluid/operators/data/CMakeLists.txt +++ b/paddle/fluid/operators/data/CMakeLists.txt @@ -5,7 +5,7 @@ if(WITH_UNITY_BUILD) endif() cc_library(pipeline SRCS pipeline.cc DEPS parallel_executor simple_threadpool) -op_library(dataloader_op SRCS dataloader_op.cc DEPS pipeline) +op_library(dataloader_op SRCS dataloader_op.cc dataloader_op.cu.cc DEPS pipeline ${OP_HEADER_DEPS}) # register_operators() diff --git a/paddle/fluid/operators/data/dataloader_op.cc b/paddle/fluid/operators/data/dataloader_op.cc index f199d80f4c7c94..52eddb25c79f73 100644 --- a/paddle/fluid/operators/data/dataloader_op.cc +++ b/paddle/fluid/operators/data/dataloader_op.cc @@ -23,14 +23,20 @@ class DataLoaderOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "DataLoaderOp"); + OP_INOUT_CHECK(ctx->HasOutputs("Out"), "Output", 
"Out", "DataLoaderOp"); } protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType(framework::proto::VarType::FP32, - platform::CPUPlace()); + ctx.GetPlace()); + } + + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const framework::Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const override { + return expected_kernel_type; } }; @@ -57,7 +63,7 @@ class DataLoaderOpMaker : public framework::OpProtoAndCheckerMaker { "The unique hash id used as cache key for " "ExecutorInfoCache"); AddAttr("prefetch_depth", - "(size_t)" + "(int64_t)" "The prefetch batch number") .SetDefault(2); AddComment(R"DOC( @@ -71,4 +77,6 @@ class DataLoaderOpMaker : public framework::OpProtoAndCheckerMaker { namespace ops = paddle::operators; REGISTER_OPERATOR(dataloader, ops::DataLoaderOp, ops::DataLoaderOpMaker); -REGISTER_OP_CPU_KERNEL(dataloader, ops::DataLoaderOpKernel); +REGISTER_OP_CPU_KERNEL( + dataloader, + ops::DataLoaderOpKernel); diff --git a/paddle/fluid/operators/data/dataloader_op.cu.cc b/paddle/fluid/operators/data/dataloader_op.cu.cc new file mode 100644 index 00000000000000..52dea24815fe1f --- /dev/null +++ b/paddle/fluid/operators/data/dataloader_op.cu.cc @@ -0,0 +1,20 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include "paddle/fluid/operators/data/dataloader_op.h" +#include "paddle/fluid/platform/float16.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL( + dataloader, + ops::DataLoaderOpKernel); diff --git a/paddle/fluid/operators/data/dataloader_op.h b/paddle/fluid/operators/data/dataloader_op.h index 8c8b0fa616a916..d2d52d9150969a 100644 --- a/paddle/fluid/operators/data/dataloader_op.h +++ b/paddle/fluid/operators/data/dataloader_op.h @@ -20,25 +20,25 @@ class Pipeline; template class DataLoaderOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - // Step1: get output vars and attrs - auto output_vars = ctx.MultiOutputVar("Out"); - auto output_var_names = ctx.OutputNames("Out"); - - auto* global_block = ctx.Attr("global_block"); - auto start_op_index = ctx.Attr("start_op_index"); - auto end_op_index = ctx.Attr("end_op_index"); - auto program_id = ctx.Attr("program_id"); - auto prefetch_depth = static_cast(ctx.Attr("prefetch_depth")); - - auto pipeline = data::PipelineManager::Instance()->GetPipeline( - program_id, global_block, ctx.GetPlace(), - start_op_index, end_op_index, output_var_names, - prefetch_depth); - - pipeline->ReadNext(output_vars); - } + public: + void Compute(const framework::ExecutionContext& ctx) const override { + // Step1: get output vars and attrs + auto output_vars = ctx.MultiOutputVar("Out"); + auto output_var_names = ctx.OutputNames("Out"); + + auto* global_block = ctx.Attr("global_block"); + auto start_op_index = ctx.Attr("start_op_index"); + auto end_op_index = ctx.Attr("end_op_index"); + auto program_id = ctx.Attr("program_id"); + auto prefetch_depth = + static_cast(ctx.Attr("prefetch_depth")); + + auto pipeline = data::PipelineManager::Instance()->GetPipeline( + program_id, global_block, ctx.GetPlace(), start_op_index, end_op_index, + output_var_names, prefetch_depth); + + pipeline->ReadNext(output_vars); + } }; 
} // namespace operators diff --git a/paddle/fluid/operators/data/pipeline.cc b/paddle/fluid/operators/data/pipeline.cc index cbc619e7164a6e..69a81133395518 100644 --- a/paddle/fluid/operators/data/pipeline.cc +++ b/paddle/fluid/operators/data/pipeline.cc @@ -16,11 +16,11 @@ namespace paddle { namespace operators { namespace data { -Pipeline::Pipeline( - const std::shared_ptr global_block, const platform::Place &place, - int64_t start_op_index, int64_t end_op_index, int64_t program_id, - const std::vector &output_var_names, - size_t prefetch_queue_size) +Pipeline::Pipeline(const std::shared_ptr global_block, + const platform::Place &place, int64_t start_op_index, + int64_t end_op_index, int64_t program_id, + const std::vector &output_var_names, + size_t prefetch_queue_size) : thread_pool_(1), closed_(false), global_block_(global_block), @@ -33,10 +33,11 @@ Pipeline::Pipeline( prefetch_queue_(prefetch_queue_size) { VLOG(1) << "Pipeline init"; - PADDLE_ENFORCE_GT(end_op_index_, start_op_index_, - platform::errors::InvalidArgument( - "end_op_index should be greater than start_op_index, " - "but recieve %d <= %d.", end_op_index_, start_op_index_)); + PADDLE_ENFORCE_GT(end_op_index_, start_op_index_, + platform::errors::InvalidArgument( + "end_op_index should be greater than start_op_index, " + "but recieve %d <= %d.", + end_op_index_, start_op_index_)); // Step1: prepare executor auto *program = global_block_->Program(); @@ -44,17 +45,17 @@ Pipeline::Pipeline( *program, place_, start_op_index_, end_op_index_, /*is_grad=*/false, program_id, &scope_); auto ¶llel_executor = cache_info.first; - + // Step2: parset persistable variables auto &skip_eager_delete_vars = - framework::ExecutorInfoCache::Instance().SkipEagerDeleteVars( - program_id, /*is_grad=*/false); + framework::ExecutorInfoCache::Instance().SkipEagerDeleteVars( + program_id, /*is_grad=*/false); if (cache_info.second /*is_new_created*/) { // DataLoader program do not has input variables, not need to // skip 
memory reuse for input variables here skip_eager_delete_vars.insert(skip_eager_delete_vars.end(), - output_var_names.begin(), - output_var_names.end()); + output_var_names.begin(), + output_var_names.end()); framework::details::ParseSafeEagerDeletionSkipVars( *program, end_op_index, output_var_names, &skip_eager_delete_vars); } @@ -63,9 +64,8 @@ Pipeline::Pipeline( StartPrefetchThread(parallel_executor, skip_eager_delete_vars); } -void Pipeline::StartPrefetchThread( - std::shared_ptr executor, - const std::vector &skip_vars) { +void Pipeline::StartPrefetchThread(std::shared_ptr executor, + const std::vector &skip_vars) { thread_pool_.enqueue([this, executor, skip_vars]() -> void { while (!closed_.load()) { // Step1: run ops by executor without fetch @@ -78,8 +78,9 @@ void Pipeline::StartPrefetchThread( auto *out_var = scope_.FindVar(output_var_names_[i]); PADDLE_ENFORCE_NOT_NULL( out_var, platform::errors::NotFound( - "The output variable %s is not found in DataLoader " - "program's internal scope", output_var_names_[i])); + "The output variable %s is not found in DataLoader " + "program's internal scope", + output_var_names_[i])); CheckOutputVarStatus(*out_var, output_var_names_[i]); copy_tensor(out_var->Get(), &t_arr[i]); } @@ -95,39 +96,38 @@ void Pipeline::StartPrefetchThread( }); } -void Pipeline::CheckOutputVarStatus( - const Variable &var, const std::string &var_name) { +void Pipeline::CheckOutputVarStatus(const Variable &var, + const std::string &var_name) { // only LoDTensor variable type support currently PADDLE_ENFORCE_EQ( var.IsType(), true, platform::errors::InvalidArgument( "The output variable %s get from DataLoader program's " "internal scope holds wrong type. 
Expect type is " - "LoDTensor, but receive type is %s.", var_name, - platform::demangle(framework::ToTypeName(var.Type())))); - PADDLE_ENFORCE_EQ( - var.Get().IsInitialized(), true, - platform::errors::InvalidArgument( - "The tensor in output variable %s get from DataLoader " - "program's internal scope is not initialized.", var_name)); + "LoDTensor, but receive type is %s.", + var_name, platform::demangle(framework::ToTypeName(var.Type())))); + PADDLE_ENFORCE_EQ(var.Get().IsInitialized(), true, + platform::errors::InvalidArgument( + "The tensor in output variable %s get from DataLoader " + "program's internal scope is not initialized.", + var_name)); } void Pipeline::ReadNext(std::vector &out_vars) { - bool ok = true; - auto vars = prefetch_queue_.Pop(&ok); - PADDLE_ENFORCE_EQ(ok, true, platform::errors::Unavailable( - "Pop prefetch queue failed.")); - PADDLE_ENFORCE_EQ(out_vars.size(), vars.size(), - platform::errors::InvalidArgument( - "Output variable number to read should be variable number " - "read from prefetch queue, but recieved %d != %d", - out_vars.size(), output_var_names_.size())); - - - for (size_t i = 0; i < vars.size(); i++) { - // out_vars[i] = &vars[i]; - copy_tensor(vars[i], out_vars[i]->GetMutable()); - } + bool ok = true; + auto vars = prefetch_queue_.Pop(&ok); + PADDLE_ENFORCE_EQ( + ok, true, platform::errors::Unavailable("Pop prefetch queue failed.")); + PADDLE_ENFORCE_EQ( + out_vars.size(), vars.size(), + platform::errors::InvalidArgument( + "Output variable number to read should be variable number " + "read from prefetch queue, but recieved %d != %d", + out_vars.size(), output_var_names_.size())); + + for (size_t i = 0; i < vars.size(); i++) { + copy_tensor(vars[i], out_vars[i]->GetMutable()); + } } inline void Pipeline::Close() { @@ -145,8 +145,8 @@ inline void Pipeline::Reset() { // StartPrefetchThread(); } -// initialization static variables out of PipelineManager -PipelineManager* PipelineManager::pm_instance_ptr_ = nullptr; +// 
initialization static variables out of PipelineManager +PipelineManager *PipelineManager::pm_instance_ptr_ = nullptr; std::mutex PipelineManager::m_; } // data diff --git a/paddle/fluid/operators/data/pipeline.h b/paddle/fluid/operators/data/pipeline.h index 9406af8ef50608..247dcbef30a6f1 100644 --- a/paddle/fluid/operators/data/pipeline.h +++ b/paddle/fluid/operators/data/pipeline.h @@ -33,105 +33,103 @@ using LoDTensorBlockingQueue = operators::reader::LoDTensorBlockingQueue; namespace data { class Pipeline { - public: - Pipeline(const std::shared_ptr global_block, const platform::Place &place, - int64_t start_op_index, int64_t end_op_index, int64_t program_id, - const std::vector &output_var_names, - size_t prefetch_queue_size); + public: + Pipeline(const std::shared_ptr global_block, + const platform::Place &place, int64_t start_op_index, + int64_t end_op_index, int64_t program_id, + const std::vector &output_var_names, + size_t prefetch_queue_size); - // ~Pipeline() { - // VLOG(1) << "~Pipeline"; - // Close(); - // } + // ~Pipeline() { + // VLOG(1) << "~Pipeline"; + // Close(); + // } - inline size_t PrefetchCap() { return prefetch_queue_.Cap(); } + inline size_t PrefetchCap() { return prefetch_queue_.Cap(); } - inline size_t PrefetchSize() { return prefetch_queue_.Size(); } + inline size_t PrefetchSize() { return prefetch_queue_.Size(); } - inline bool IsClosed() { return closed_; } + inline bool IsClosed() { return closed_; } - inline void Close(); + inline void Close(); - inline void Reset(); + inline void Reset(); - void ReadNext(std::vector &out_vars); + void ReadNext(std::vector &out_vars); - private: - void copy_tensor(const framework::LoDTensor &lod_tensor, - framework::LoDTensor *out) const { - if (lod_tensor.numel() == 0) return; - auto &out_tensor = *out; - TensorCopy(lod_tensor, lod_tensor.place(), &out_tensor); - out_tensor.set_lod(lod_tensor.lod()); - } + private: + void copy_tensor(const framework::LoDTensor &lod_tensor, + framework::LoDTensor 
*out) const { + if (lod_tensor.numel() == 0) return; + auto &out_tensor = *out; + TensorCopy(lod_tensor, lod_tensor.place(), &out_tensor); + out_tensor.set_lod(lod_tensor.lod()); + } - void StartPrefetchThread(std::shared_ptr executor, - const std::vector &skip_vars); + void StartPrefetchThread(std::shared_ptr executor, + const std::vector &skip_vars); - void CheckOutputVarStatus(const Variable &var, const std::string &var_name); + void CheckOutputVarStatus(const Variable &var, const std::string &var_name); - ThreadPool thread_pool_; - std::atomic closed_; + ThreadPool thread_pool_; + std::atomic closed_; - Scope scope_; - std::shared_ptr global_block_; - platform::Place place_; - int64_t start_op_index_; - int64_t end_op_index_; - int64_t program_id_; + Scope scope_; + std::shared_ptr global_block_; + platform::Place place_; + int64_t start_op_index_; + int64_t end_op_index_; + int64_t program_id_; - std::vector output_var_names_; + std::vector output_var_names_; - const size_t prefetch_queue_size_; - LoDTensorBlockingQueue prefetch_queue_; + const size_t prefetch_queue_size_; + LoDTensorBlockingQueue prefetch_queue_; }; class PipelineManager { // PipelineManager is a signleton manager for Pipeline, we // create single Pipeline for a program id - private: - DISABLE_COPY_AND_ASSIGN(PipelineManager); + private: + DISABLE_COPY_AND_ASSIGN(PipelineManager); - static PipelineManager* pm_instance_ptr_; - std::map> prog_id_to_pipeline_; - static std::mutex m_; + static PipelineManager *pm_instance_ptr_; + std::map> prog_id_to_pipeline_; + static std::mutex m_; - public: - static PipelineManager* Instance() { + public: + static PipelineManager *Instance() { + if (pm_instance_ptr_ == nullptr) { + std::lock_guard lk(m_); if (pm_instance_ptr_ == nullptr) { - std::lock_guard lk(m_); - if (pm_instance_ptr_ == nullptr) { - pm_instance_ptr_ = new PipelineManager; - } + pm_instance_ptr_ = new PipelineManager; } - return pm_instance_ptr_; } - - std::shared_ptr GetPipeline( - 
int64_t program_id, BlockDesc* global_block, - const platform::Place &place, int64_t start_op_index, - int64_t end_op_index, - const std::vector &output_var_names, - size_t prefetch_queue_size) { - auto iter = prog_id_to_pipeline_.find(program_id); - if (iter != prog_id_to_pipeline_.end()) { - prog_id_to_pipeline_[program_id] = \ - std::shared_ptr(new Pipeline( - std::shared_ptr(global_block), place, - start_op_index, end_op_index, program_id, - output_var_names, prefetch_queue_size)); - return prog_id_to_pipeline_[program_id]; - } else { - return iter->second; - } + return pm_instance_ptr_; + } + + std::shared_ptr GetPipeline( + int64_t program_id, BlockDesc *global_block, const platform::Place &place, + int64_t start_op_index, int64_t end_op_index, + const std::vector &output_var_names, + size_t prefetch_queue_size) { + auto iter = prog_id_to_pipeline_.find(program_id); + if (iter == prog_id_to_pipeline_.end()) { + prog_id_to_pipeline_[program_id] = std::shared_ptr(new Pipeline( + std::shared_ptr(global_block), place, start_op_index, + end_op_index, program_id, output_var_names, prefetch_queue_size)); + return prog_id_to_pipeline_[program_id]; + } else { + return iter->second; } + } - PipelineManager() { VLOG(1) << "PipelineManager init"; } + PipelineManager() { VLOG(1) << "PipelineManager init"; } - ~PipelineManager() { - VLOG(1) << "~PipelineManager"; - prog_id_to_pipeline_.clear(); - } + ~PipelineManager() { + VLOG(1) << "~PipelineManager"; + prog_id_to_pipeline_.clear(); + } }; } // data diff --git a/paddle/fluid/operators/data/unity_build_rule.cmake b/paddle/fluid/operators/data/unity_build_rule.cmake index b7bab025785268..1fe36081c76c8a 100644 --- a/paddle/fluid/operators/data/unity_build_rule.cmake +++ b/paddle/fluid/operators/data/unity_build_rule.cmake @@ -7,3 +7,6 @@ register_unity_group(cc pipeline.cc dataloader_op.cc) + +register_unity_group(cu + dataloader_op.cu.cc) diff --git a/paddle/fluid/pybind/op_function_generator.cc 
b/paddle/fluid/pybind/op_function_generator.cc index 01d101909b549b..410f606e5592b9 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -173,6 +173,7 @@ std::map> op_passing_outs_map = { {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}}, {"rnn", {"DropoutState"}}, {"run_program", {"Out", "DOut", "OutScope"}}, + {"dataloader", {"Out"}}, {"clear_float_status", {"FloatStatusOut"}}, {"get_float_status", {"FloatStatusOut"}}, }; diff --git a/python/paddle/fluid/dataloader/pipeline.py b/python/paddle/fluid/dataloader/pipeline.py index ec9178ce1f4e51..08136ee3000485 100755 --- a/python/paddle/fluid/dataloader/pipeline.py +++ b/python/paddle/fluid/dataloader/pipeline.py @@ -38,6 +38,7 @@ def __init__(self, queue_depth=2): assert isinstance(queue_depth, int), \ "queue_depth should be an integer" self._queue_depth = queue_depth + self._init_programs() def _init_programs(self): self._main_program = fluid.Program() @@ -48,13 +49,17 @@ def _init_programs(self): def __enter__(self): # switch main and startup program - self._main_program = fluid.switch_main_program(self._main_program) - self._startup_program = fluid.switch_startup_program(self._startup_program) + paddle.enable_static() + self._main_program = framework.switch_main_program(self._main_program) + self._startup_program = framework.switch_startup_program( + self._startup_program) return self - def __exit__(self): - self._main_program = fluid.switch_main_program(self._main_program) - self._startup_program = fluid.switch_startup_program(self._startup_program) + def __exit__(self, exception_type, exception_value, traceback): + self._main_program = framework.switch_main_program(self._main_program) + self._startup_program = framework.switch_startup_program( + self._startup_program) + paddle.disable_static() def set_outputs(self, outputs): if isinstance(outputs, Sequence): @@ -69,14 +74,12 @@ def set_outputs(self, outputs): "outputs should be 
list, dict or Variable" def build(self): - self._output_vars = self._prepare_output_vars() global_block = self._main_program.desc.block(0) program_id = _hash_with_id(self._main_program, self) self._attrs = ('global_block', global_block, 'start_op_index', 0, - 'end_op_index', global_block.op_size(), - 'program_id', program_id) - + 'end_op_index', global_block.op_size(), 'program_id', + program_id) self._is_built = True def _prepare_output_vars(self): @@ -87,19 +90,26 @@ def _prepare_output_vars(self): var_desc = var.desc output_var = core.VarBase(var_desc.dtype(), var_desc.shape(), - var_desc.name(), - var_desc.type(), False) + var_desc.name(), var_desc.type(), False) output_vars.append(output_var) return output_vars + def __iter__(self): + return self + def __next__(self): assert self._is_built, \ "Pipeline not built, please call build() firstly" + self._output_vars = self._prepare_output_vars() + + # try: _C_ops.dataloader(self._output_vars, *self._attrs) - return {k: v for k, v in zip(self._output_vars, self._out_names)} + # except KeyboardInterrupt: + # pass + + return {k: v for k, v in zip(self._out_names, self._output_vars)} # Python 2 compatable def next(self): return self.__next__() - From 548bf840203f91ffc081deaede04c5135a8626f4 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 9 Nov 2021 03:07:26 +0000 Subject: [PATCH 05/95] add DataScope --- paddle/fluid/operators/data/CMakeLists.txt | 2 +- paddle/fluid/operators/data/data_scope.h | 280 +++++++++++++++++++++ paddle/fluid/operators/data/pipeline.h | 5 +- 3 files changed, 284 insertions(+), 3 deletions(-) create mode 100644 paddle/fluid/operators/data/data_scope.h diff --git a/paddle/fluid/operators/data/CMakeLists.txt b/paddle/fluid/operators/data/CMakeLists.txt index cc6ee7c8e9b63f..38787d03cc64b0 100644 --- a/paddle/fluid/operators/data/CMakeLists.txt +++ b/paddle/fluid/operators/data/CMakeLists.txt @@ -4,7 +4,7 @@ if(WITH_UNITY_BUILD) include(unity_build_rule.cmake) endif() -cc_library(pipeline SRCS 
pipeline.cc DEPS parallel_executor simple_threadpool) +cc_library(pipeline SRCS pipeline.cc DEPS parallel_executor simple_threadpool scope) op_library(dataloader_op SRCS dataloader_op.cc dataloader_op.cu.cc DEPS pipeline ${OP_HEADER_DEPS}) # register_operators() diff --git a/paddle/fluid/operators/data/data_scope.h b/paddle/fluid/operators/data/data_scope.h new file mode 100644 index 00000000000000..92939fa86a2024 --- /dev/null +++ b/paddle/fluid/operators/data/data_scope.h @@ -0,0 +1,280 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +extern "C" { +#include +} + +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/operators/reader/blocking_queue.h" + +// When in inference scenario, the scopes will not be written by two threads in +// a mean time, but a scope may be read by multiple threads concurrently, and +// the mutex will cause serious performance issue. +// So the mutex is disabled when `ON_INFER`. 
+#ifdef PADDLE_ON_INFERENCE +#define SCOPE_KIDS_READER_LOCK +#define SCOPE_KIDS_WRITER_LOCK +#define SCOPE_VARS_READER_LOCK +#define SCOPE_VARS_WRITER_LOCK +#else +#define SCOPE_KIDS_READER_LOCK AutoRDLock auto_lock(&kids_lock_); +#define SCOPE_KIDS_WRITER_LOCK AutoWRLock auto_lock(&kids_lock_); +#define SCOPE_VARS_READER_LOCK AutoRDLock auto_lock(&vars_lock_); +#define SCOPE_VARS_WRITER_LOCK AutoWRLock auto_lock(&vars_lock_); +#endif + +namespace paddle { +namespace framework { + +class Scope; +class Variable; + +template +using BlockingQueue = operators::reader::BlockingQueue; + +/** + * @brief DataScope that manage all variables in data pipeline. + * + * In data pipeline, we need a queue between each OPs to buffer data + * to support data prefetch and OP running asynchronously, DataScope + * contains name -> Variable map as {name: BlockingQueue} + */ +class DataScope : public Scope { + public: + + DataScope() {} + + /// Create a sub-scope. Returns a reference other than a pointer so + /// to prevent from manual deletion. + /// Mark it to const because that new kid scope cannot change parent scope. + DataScope& NewScope() const { + DataScope* child = new DataScope(this); + { + SCOPE_KIDS_WRITER_LOCK + kids_.push_back(child); + } + return *child; + } + + /// Create a sub-scope for current scope but do not record it in the kids to + /// avoid performance problems. 
+ std::unique_ptr NewTmpScope() const { + return std::unique_ptr(new DataScope(this)); + } + + // void EraseVars(const std::vector& var_names) { + // std::set var_set(var_names.begin(), var_names.end()); + // SCOPE_VARS_WRITER_LOCK + // for (auto it = var_queues_.begin(); it != var_queues_.end();) { + // if (var_set.find(it->first) != var_set.end()) { + // it = var_queues_.erase(it); + // } else { + // ++it; + // } + // } + // } + // + // void EraseVarsExcept(const std::unordered_set& vars) { + // SCOPE_VARS_WRITER_LOCK + // for (auto iter = var_queues_.begin(); iter != var_queues_.end();) { + // if (vars.count(iter->second.get()) != 0) { + // ++iter; + // } else { + // var_queues_.erase(iter++); + // } + // } + // } + + // /// Find a variable in the scope or any of its ancestors. Returns + // /// nullptr if cannot find. + // /// Caller doesn't own the returned Variable. + // Variable* FindVar(const std::string& name) const { + // SCOPE_VARS_READER_LOCK + // return FindVarInternal(name); + // } + + // // Get a variable in the scope or any of its ancestors. Enforce + // /// the returned Variable is not nullptr + // Variable* GetVar(const std::string& name) const { + // auto* var = FindVar(name); + // PADDLE_ENFORCE_NOT_NULL( + // var, platform::errors::NotFound("Cannot find %s in scope.", name)); + // return var; + // } + + /// Find a variable in the current scope. + /// Return nullptr if cannot find. + /// Caller doesn't own the returned Variable. + Variable* FindLocalVar(const std::string& name) const { + SCOPE_VARS_READER_LOCK + return FindVarLocally(name); + } + + const Scope* parent() const { return parent_; } + + /// Find the scope or an ancestor scope that contains the given variable. + // const Scope* FindScope(const Variable* var) const; + + // /// Find the scope or an ancestor scope that contains the given variable name. 
+ // const Scope* FindScope(const std::string& name) const; + + // void DeleteScope(Scope* scope) const; + + // /// Drop all kids scopes belonged to this scope. + // void DropKids(); + + // /// Find if a scope exists in the kid scopes + // bool HasKid(const Scope* scope) const; + + // const std::list& kids() const { return kids_; } + + // enumerate all the variables current contains. + std::vector LocalVarNames() const { + std::vector known_vars; + { + SCOPE_VARS_READER_LOCK + known_vars.reserve(this->var_queues_.size()); + for (auto& p : var_queues_) { + known_vars.emplace_back(p.first); + } + } + return known_vars; + } + + // // Rename variable to a new name + // void Rename(const std::string& origin_name, + // const std::string& new_name) const; + // + // // Rename variable to a new name and return the new name + // std::string Rename(const std::string& origin_name) const; + + protected: + // struct KeyHasher { + // std::size_t operator()(const std::string& key) const { + // return XXH32(key.c_str(), key.size(), 1); + // } + // }; + + mutable std::unordered_map>, KeyHasher> var_queues_; + + private: + // Call NewScope for a sub-scope. + explicit DataScope(Scope const* parent) : parent_(parent) {} + + // Called by Var. + Variable* VarInternal(const std::string& name) { + auto* v = FindVarLocally(name); + if (v != nullptr) return v; + + auto q = GetBlockingQueue(name); + v = new Variable(); + q->Send(*v); + VLOG(3) << "Create Variable BlockingQueue and Create a Variable in it" << name; + return v; + } + + Variable* FindVarInternal(const std::string& name) const { + auto var = FindVarLocally(name); + if (var != nullptr) { + return var; + } + return (parent_ == nullptr) ? nullptr : parent_->FindVar(name); + } + + // // Called by FindScope. + // const Scope* FindScopeInternal(const Variable* var) const { + // for (auto& kv : var_queues_) { + // if (kv.second.get() == var) { + // return this; + // } + // } + // return (parent_ == nullptr) ? 
nullptr : parent_->FindScope(var); + // } + + // // Called by FindScope. + const Scope* FindScopeInternal(const std::string& name) const { + if (var_queues_.find(name) != var_queues_.end()) { + return this; + } + return (parent_ == nullptr) ? nullptr : parent_->FindScope(name); + } + + // // Called by Rename. + void RenameInternal(const std::string& origin_name, + const std::string& new_name) const { + auto origin_it = var_queues_.find(origin_name); + PADDLE_ENFORCE_NE( + origin_it, var_queues_.end(), + platform::errors::NotFound( + "Original variable with name %s is not found in the scope.", + origin_name)); + auto new_it = var_queues_.find(new_name); + PADDLE_ENFORCE_EQ( + new_it, var_queues_.end(), + platform::errors::AlreadyExists( + "The variable with name %s already exists in the scope.", new_name)); + var_queues_[new_name].reset(origin_it->second.release()); + var_queues_.erase(origin_it); + } + + // Called by FindVarInternal and Var. + Variable* FindVarLocally(const std::string& name) const { + auto it = var_queues_.find(name); + if (it != var_queues_.end()) { + auto q = it->second.get(); + Variable* v = nullptr; + if (q->Size() <= 0 || !q->Receive(v)) { + return nullptr; + } + return v; + } + return nullptr; + } + + BlockingQueue* GetBlockingQueue(const std::string& name) const { + auto it = var_queues_.find(name); + if (it != var_queues_.end()) { + return it->second.get(); + } + auto q = new BlockingQueue(2); + var_queues_.emplace(name, std::unique_ptr>(q)); + return q; + } + + // Scope in `kids_` are owned by this class. 
+ mutable std::list kids_; + const Scope* parent_{nullptr}; + + DISABLE_COPY_AND_ASSIGN(DataScope); + +#ifndef PADDLE_ON_INFERENCE + + private: + mutable RWLock kids_lock_; + mutable RWLock vars_lock_; +#endif +}; +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/operators/data/pipeline.h b/paddle/fluid/operators/data/pipeline.h index 247dcbef30a6f1..2b14b620ea9269 100644 --- a/paddle/fluid/operators/data/pipeline.h +++ b/paddle/fluid/operators/data/pipeline.h @@ -18,12 +18,13 @@ #include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" +#include "paddle/fluid/operators/data/data_scope.h" namespace paddle { namespace operators { using BlockDesc = framework::BlockDesc; -using Scope = framework::Scope; +using DataScope = framework::DataScope; using ParallelExecutor = framework::ParallelExecutor; using Variable = framework::Variable; @@ -74,7 +75,7 @@ class Pipeline { ThreadPool thread_pool_; std::atomic closed_; - Scope scope_; + DataScope scope_; std::shared_ptr global_block_; platform::Place place_; int64_t start_op_index_; From 9bd247bdcbdf94ce175f588ad44bf82bde338d68 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 9 Nov 2021 08:57:09 +0000 Subject: [PATCH 06/95] refine pipeline manager --- paddle/fluid/operators/data/pipeline.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/data/pipeline.h b/paddle/fluid/operators/data/pipeline.h index 2b14b620ea9269..9bdeece9b9fdc2 100644 --- a/paddle/fluid/operators/data/pipeline.h +++ b/paddle/fluid/operators/data/pipeline.h @@ -95,9 +95,10 @@ class PipelineManager { DISABLE_COPY_AND_ASSIGN(PipelineManager); static PipelineManager *pm_instance_ptr_; - std::map> prog_id_to_pipeline_; static std::mutex m_; + std::map> prog_id_to_pipeline_; + public: static PipelineManager *Instance() { if (pm_instance_ptr_ == nullptr) { @@ -109,19 +110,19 @@ class PipelineManager { return 
pm_instance_ptr_; } - std::shared_ptr GetPipeline( + Pipeline* GetPipeline( int64_t program_id, BlockDesc *global_block, const platform::Place &place, int64_t start_op_index, int64_t end_op_index, const std::vector &output_var_names, size_t prefetch_queue_size) { auto iter = prog_id_to_pipeline_.find(program_id); if (iter == prog_id_to_pipeline_.end()) { - prog_id_to_pipeline_[program_id] = std::shared_ptr(new Pipeline( + prog_id_to_pipeline_[program_id] = std::unique_ptr(new Pipeline( std::shared_ptr(global_block), place, start_op_index, end_op_index, program_id, output_var_names, prefetch_queue_size)); - return prog_id_to_pipeline_[program_id]; + return prog_id_to_pipeline_[program_id].get(); } else { - return iter->second; + return iter->second.get(); } } From efec91b22a4db6ff431036ac3f1f5597fa465ae7 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 9 Nov 2021 15:37:09 +0000 Subject: [PATCH 07/95] add map_op and map_runner, compile success --- paddle/fluid/operators/data/CMakeLists.txt | 3 + paddle/fluid/operators/data/map_op.cc | 80 ++++++++ paddle/fluid/operators/data/map_op.cu.cc | 20 ++ paddle/fluid/operators/data/map_op.h | 94 ++++++++++ paddle/fluid/operators/data/map_runner.cc | 174 ++++++++++++++++++ paddle/fluid/operators/data/map_runner.h | 137 ++++++++++++++ paddle/fluid/operators/data/pipeline.h | 4 +- .../operators/data/unity_build_rule.cmake | 7 +- 8 files changed, 515 insertions(+), 4 deletions(-) create mode 100644 paddle/fluid/operators/data/map_op.cc create mode 100644 paddle/fluid/operators/data/map_op.cu.cc create mode 100644 paddle/fluid/operators/data/map_op.h create mode 100644 paddle/fluid/operators/data/map_runner.cc create mode 100644 paddle/fluid/operators/data/map_runner.h diff --git a/paddle/fluid/operators/data/CMakeLists.txt b/paddle/fluid/operators/data/CMakeLists.txt index 38787d03cc64b0..2f485732cff904 100644 --- a/paddle/fluid/operators/data/CMakeLists.txt +++ b/paddle/fluid/operators/data/CMakeLists.txt @@ -7,6 +7,9 @@ 
endif() cc_library(pipeline SRCS pipeline.cc DEPS parallel_executor simple_threadpool scope) op_library(dataloader_op SRCS dataloader_op.cc dataloader_op.cu.cc DEPS pipeline ${OP_HEADER_DEPS}) +cc_library(map_runner SRCS map_runner.cc DEPS parallel_executor simple_threadpool scope) +op_library(map_op SRCS map_op.cc map_op.cu.cc DEPS map_runner ${OP_HEADER_DEPS}) + # register_operators() # TODO: add test here diff --git a/paddle/fluid/operators/data/map_op.cc b/paddle/fluid/operators/data/map_op.cc new file mode 100644 index 00000000000000..0fd5416a1a512e --- /dev/null +++ b/paddle/fluid/operators/data/map_op.cc @@ -0,0 +1,80 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include "paddle/fluid/operators/data/map_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/imperative/type_defs.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class MapOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "MapOp"); + OP_INOUT_CHECK(ctx->HasOutputs("Out"), "Output", "Out", "MapOp"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(framework::proto::VarType::FP32, + ctx.GetPlace()); + } + + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const framework::Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const override { + return expected_kernel_type; + } +}; + +class MapOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(LoDTensorBlockingQueueHolder)" + "The input tensors of Map operator") + .AsDuplicable(); + AddOutput("Out", + "(LoDTensorBlockingQueueHolder)" + "The output tensors of Map operator") + .AsDuplicable(); + AddAttr("global_block", + "(BlockDesc *)" + "The global block of executed dataloader program " + "desc."); + AddAttr("start_op_index", + "(int64_t)" + "The index of the op to start execution"); + AddAttr("end_op_index", + "(int64_t)" + "The index of the op to stop execution"); + AddAttr("program_id", + "(int64_t)" + "The unique hash id used as cache key for " + "ExecutorInfoCache"); + AddComment(R"DOC( + Map Op + )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(map, ops::MapOp, ops::MapOpMaker); +REGISTER_OP_CPU_KERNEL(map, ops::MapOpKernel); diff --git a/paddle/fluid/operators/data/map_op.cu.cc 
b/paddle/fluid/operators/data/map_op.cu.cc new file mode 100644 index 00000000000000..7f931b2a1281b2 --- /dev/null +++ b/paddle/fluid/operators/data/map_op.cu.cc @@ -0,0 +1,20 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/fluid/operators/data/map_op.h" +#include "paddle/fluid/platform/float16.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL( + map, + ops::MapOpKernel); diff --git a/paddle/fluid/operators/data/map_op.h b/paddle/fluid/operators/data/map_op.h new file mode 100644 index 00000000000000..c6cea02cd09a71 --- /dev/null +++ b/paddle/fluid/operators/data/map_op.h @@ -0,0 +1,94 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" +#include "paddle/fluid/operators/data/map_runner.h" + +namespace paddle { +namespace operators { + +using Variable = framework::Variable; +using LoDTensor = framework::LoDTensor; +using LoDTensorBlockingQueueHolder = operators::reader::LoDTensorBlockingQueueHolder; + + +static void CheckInputQueueStatus(const std::vector& vars) { + for (auto var : vars) { + PADDLE_ENFORCE_EQ(var->IsType(), true, + platform::errors::InvalidArgument( + "Input Variables of MapOp should hold " + "LoDTensorBlockingQueueHolder type")); + auto queue = var->Get().GetQueue(); + PADDLE_ENFORCE_NE(queue, nullptr, + platform::errors::InvalidArgument( + "Input LoDTensorBlockingQueue is not initialized")); + } +} + +static void CheckAndInitOutputQueue(const std::vector& vars, int capacity) { + for (auto var : vars) { + if (var->IsInitialized()) { + PADDLE_ENFORCE_EQ(var->IsType(), true, + platform::errors::InvalidArgument( + "Output Variables of MapOp should hold " + "LoDTensorBlockingQueueHolder type")); + auto queue = var->Get().GetQueue(); + PADDLE_ENFORCE_NE(queue, nullptr, + platform::errors::InvalidArgument( + "Input LoDTensorBlockingQueue is not initialized")); + } else { + // VLOG(1) << "Initialize Output LoDTensorBlockingQueue capacity " << capacity; + LOG(ERROR) << "Initialize Output LoDTensorBlockingQueue capacity " << capacity; + auto* holder = var->GetMutable(); + holder->InitOnce(capacity); + } + } +} + +static std::vector> GetQueueVecFromVariableVec(const std::vector& vars) { + std::vector> queues; + queues.reserve(vars.size()); + for (size_t i = 0; i < vars.size(); i++) { + queues.push_back(vars[i]->Get().GetQueue()); + } + return queues; +} + +template +class MapOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + // Step1: get output vars and attrs + auto input_vars = 
ctx.MultiInputVar("X"); + auto input_var_names = ctx.InputNames("X"); + auto output_vars = ctx.MultiOutputVar("Out"); + auto output_var_names = ctx.OutputNames("Out"); + + CheckInputQueueStatus(input_vars); + CheckAndInitOutputQueue(output_vars, /*capacity=*/2); + + auto* global_block = ctx.Attr("global_block"); + auto start_op_index = ctx.Attr("start_op_index"); + auto end_op_index = ctx.Attr("end_op_index"); + auto program_id = ctx.Attr("program_id"); + + auto input_queues = GetQueueVecFromVariableVec(input_vars); + auto output_queues = GetQueueVecFromVariableVec(output_vars); + data::MapRunnerManager::Instance()->StartMapRunner( + program_id, global_block, ctx.GetPlace(), start_op_index, end_op_index, + input_var_names, output_var_names, input_queues, output_queues); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/data/map_runner.cc b/paddle/fluid/operators/data/map_runner.cc new file mode 100644 index 00000000000000..9a6173d7fc2570 --- /dev/null +++ b/paddle/fluid/operators/data/map_runner.cc @@ -0,0 +1,174 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include "paddle/fluid/operators/data/map_runner.h" +#include "paddle/fluid/framework/executor_cache.h" + +namespace paddle { +namespace operators { +namespace data { + +MapRunner::MapRunner( + const std::shared_ptr global_block, + const platform::Place &place, int64_t start_op_index, + int64_t end_op_index, int64_t program_id, + const std::vector &input_var_names, + const std::vector &output_var_names, + const std::vector> input_queues, + const std::vector> output_queues) + : thread_pool_(1), + closed_(false), + global_block_(global_block), + place_(place), + start_op_index_(start_op_index), + end_op_index_(end_op_index), + program_id_(program_id), + input_var_names_(input_var_names), + output_var_names_(output_var_names), + input_queues_(input_queues), + output_queues_(output_queues) { + + VLOG(1) << "MapRunner init"; + + PADDLE_ENFORCE_GT(end_op_index_, start_op_index_, + platform::errors::InvalidArgument( + "end_op_index should be greater than start_op_index, " + "but recieve %d <= %d.", + end_op_index_, start_op_index_)); + PADDLE_ENFORCE_EQ(input_var_names_.size(), input_queues_.size(), + platform::errors::InvalidArgument( + "input_var_names length should be equal to input_queues length, " + "but recieve %d != %d.", + input_var_names_.size(), + input_var_names_.size())); + PADDLE_ENFORCE_EQ(output_var_names_.size(), output_queues_.size(), + platform::errors::InvalidArgument( + "output_var_names length should be equal to output_queues length, " + "but recieve %d != %d.", + output_var_names_.size(), + output_var_names_.size())); + + // Step1: prepare executor + auto *program = global_block_->Program(); + auto cache_info = framework::GetExecutorInfoFromCache( + *program, place_, start_op_index_, end_op_index_, + /*is_grad=*/false, program_id, &scope_); + auto ¶llel_executor = cache_info.first; + + // Step2: parset persistable variables + auto &skip_eager_delete_vars = + framework::ExecutorInfoCache::Instance().SkipEagerDeleteVars( + program_id, 
/*is_grad=*/false); + if (cache_info.second /*is_new_created*/) { + skip_eager_delete_vars.insert(skip_eager_delete_vars.end(), + output_var_names.begin(), + output_var_names.end()); + framework::details::ParseSafeEagerDeletionSkipVars( + *program, end_op_index, output_var_names, &skip_eager_delete_vars); + } + + // Step3: start prefetch thread + StartMapThread(parallel_executor, skip_eager_delete_vars); +} + +bool MapRunner::ShareInputsIntoScope() { + for (size_t i = 0; i < input_queues_.size(); i++) { + // If input queue closed, namely EOE(end of epoch) from + // dataset reader to here, read failed + auto queue = input_queues_[i]; + if (queue->IsClosed()) return false; + + // read LoDTensorArray + bool success = true; + auto lod_tensor_arr = queue->Pop(&success); + if (!success) return false; + + // read LoDTensor + auto tensor = lod_tensor_arr[0]; + if(!tensor.IsInitialized()) return false; + + // get input variable from scope and check status + auto name = input_var_names_[i]; + auto* var = scope_.Var(name); + if (!var->IsType() || !var->IsInitialized()) return false; + + // share input tensor to variable + auto* dst_tensor = var->GetMutable(); + dst_tensor->ShareDataWith(tensor); + dst_tensor->set_lod(tensor.lod()); + } + return true; +} + +void MapRunner::StartMapThread(std::shared_ptr executor, + const std::vector &skip_vars) { + thread_pool_.enqueue([this, executor, skip_vars]() -> void { + while (!closed_.load()) { + // Step1: get input LoDTensor and share into Scope + bool success = ShareInputsIntoScope(); + if (!success) { + Close(); + break; + } + + // Step2: run ops by executor without fetch + executor->RunWithoutFetch(skip_vars); + + // Step3: fetch output variable to LoDTensor vector + // and push to output queue + for (size_t i = 0; i < output_var_names_.size(); i++) { + framework::LoDTensorArray t_arr(1); + auto *out_var = scope_.FindVar(output_var_names_[i]); + PADDLE_ENFORCE_NOT_NULL( + out_var, platform::errors::NotFound( + "The output variable 
%s is not found in DataLoader " + "program's internal scope", + output_var_names_[i])); + CheckOutputVarStatus(*out_var, output_var_names_[i]); + copy_tensor(out_var->Get(), &t_arr[0]); + output_queues_[i]->Push(t_arr); + } + } + }); +} + +void MapRunner::CheckOutputVarStatus(const Variable &var, + const std::string &var_name) { + // only LoDTensor variable type support currently + PADDLE_ENFORCE_EQ( + var.IsType(), true, + platform::errors::InvalidArgument( + "The output variable %s get from DataLoader program's " + "internal scope holds wrong type. Expect type is " + "LoDTensor, but receive type is %s.", + var_name, platform::demangle(framework::ToTypeName(var.Type())))); + PADDLE_ENFORCE_EQ(var.Get().IsInitialized(), true, + platform::errors::InvalidArgument( + "The tensor in output variable %s get from DataLoader " + "program's internal scope is not initialized.", + var_name)); +} + +inline void MapRunner::Close() { + VLOG(1) << "MapRunner close"; + for (auto queue : output_queues_) { + queue->Close(); + } + closed_ = true; +} + +// initialization static variables out of MapRunnerManager +MapRunnerManager *MapRunnerManager::pm_instance_ptr_ = nullptr; +std::mutex MapRunnerManager::m_; + +} // data +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/data/map_runner.h b/paddle/fluid/operators/data/map_runner.h new file mode 100644 index 00000000000000..3495810c8d0455 --- /dev/null +++ b/paddle/fluid/operators/data/map_runner.h @@ -0,0 +1,137 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once +#include +#include +#include +#include +#include "ThreadPool.h" + +#include "paddle/fluid/framework/parallel_executor.h" +#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" +#include "paddle/fluid/operators/data/data_scope.h" + +namespace paddle { +namespace operators { + +using BlockDesc = framework::BlockDesc; +using Scope = framework::Scope; +using ParallelExecutor = framework::ParallelExecutor; + +using Variable = framework::Variable; +using LoDTensor = framework::LoDTensor; +using LoDTensorBlockingQueue = operators::reader::LoDTensorBlockingQueue; + +namespace data { + +class MapRunner { + public: + MapRunner(const std::shared_ptr global_block, + const platform::Place &place, int64_t start_op_index, + int64_t end_op_index, int64_t program_id, + const std::vector &input_var_names, + const std::vector &output_var_names, + const std::vector> input_queues, + const std::vector> output_queues); + + // ~MapRunner() { + // VLOG(1) << "~MapRunner"; + // Close(); + // } + + inline bool IsClosed() { return closed_; } + + inline void Close(); + + inline void Reset(); + + private: + void copy_tensor(const framework::LoDTensor &lod_tensor, + framework::LoDTensor *out) const { + if (lod_tensor.numel() == 0) return; + auto &out_tensor = *out; + TensorCopy(lod_tensor, lod_tensor.place(), &out_tensor); + out_tensor.set_lod(lod_tensor.lod()); + } + + bool ShareInputsIntoScope(); + void StartMapThread(std::shared_ptr executor, + const std::vector &skip_vars); + + void CheckInputVarStatus(const Variable &var, const std::string &var_name); + void 
CheckOutputVarStatus(const Variable &var, const std::string &var_name); + + ThreadPool thread_pool_; + std::atomic closed_; + + Scope scope_; + std::shared_ptr global_block_; + platform::Place place_; + int64_t start_op_index_; + int64_t end_op_index_; + int64_t program_id_; + + std::vector input_var_names_; + std::vector output_var_names_; + std::vector> input_queues_; + std::vector> output_queues_; +}; + +class MapRunnerManager { + // MapRunnerManager is a signleton manager for MapRunner, we + // create single MapRunner for a program id + private: + DISABLE_COPY_AND_ASSIGN(MapRunnerManager); + + static MapRunnerManager *pm_instance_ptr_; + static std::mutex m_; + + std::map> prog_id_to_runner_; + + public: + static MapRunnerManager *Instance() { + if (pm_instance_ptr_ == nullptr) { + std::lock_guard lk(m_); + if (pm_instance_ptr_ == nullptr) { + pm_instance_ptr_ = new MapRunnerManager; + } + } + return pm_instance_ptr_; + } + + void StartMapRunner( + int64_t program_id, BlockDesc *global_block, const platform::Place &place, + int64_t start_op_index, int64_t end_op_index, + const std::vector &input_var_names, + const std::vector &output_var_names, + const std::vector> &input_queues, + const std::vector> &output_queues) { + auto iter = prog_id_to_runner_.find(program_id); + if (iter == prog_id_to_runner_.end()) { + prog_id_to_runner_[program_id] = std::unique_ptr(new MapRunner( + std::shared_ptr(global_block), place, start_op_index, + end_op_index, program_id, input_var_names, output_var_names, + input_queues, output_queues)); + } + } + + MapRunnerManager() { VLOG(1) << "MapRunnerManager init"; } + + ~MapRunnerManager() { + VLOG(1) << "~MapRunnerManager"; + prog_id_to_runner_.clear(); + } +}; + +} // data +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/data/pipeline.h b/paddle/fluid/operators/data/pipeline.h index 9bdeece9b9fdc2..2f0f10abe65579 100644 --- a/paddle/fluid/operators/data/pipeline.h +++ 
b/paddle/fluid/operators/data/pipeline.h @@ -24,7 +24,7 @@ namespace paddle { namespace operators { using BlockDesc = framework::BlockDesc; -using DataScope = framework::DataScope; +using Scope = framework::Scope; using ParallelExecutor = framework::ParallelExecutor; using Variable = framework::Variable; @@ -75,7 +75,7 @@ class Pipeline { ThreadPool thread_pool_; std::atomic closed_; - DataScope scope_; + Scope scope_; std::shared_ptr global_block_; platform::Place place_; int64_t start_op_index_; diff --git a/paddle/fluid/operators/data/unity_build_rule.cmake b/paddle/fluid/operators/data/unity_build_rule.cmake index 1fe36081c76c8a..a6c05c6a3623f7 100644 --- a/paddle/fluid/operators/data/unity_build_rule.cmake +++ b/paddle/fluid/operators/data/unity_build_rule.cmake @@ -6,7 +6,10 @@ # in combination rule, you can remove the source file from the following rules. register_unity_group(cc pipeline.cc - dataloader_op.cc) + map_runner.cc + dataloader_op.cc + map_op.cc) register_unity_group(cu - dataloader_op.cu.cc) + dataloader_op.cu.cc + map_op.cu.cc) From 798331b0f0d6dd9acc67c7e23a4e9ebe4451cce0 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Wed, 10 Nov 2021 15:44:26 +0000 Subject: [PATCH 08/95] add python API and VarType LOD_TENSOR_BLOCKING_QUEUE --- paddle/fluid/framework/framework.proto | 2 + paddle/fluid/framework/var_type_traits.h | 2 + paddle/fluid/operators/data/map_op.cc | 8 ++- paddle/fluid/operators/data/map_op.h | 4 +- paddle/fluid/pybind/op_function_generator.cc | 2 + paddle/fluid/pybind/protobuf.cc | 1 + python/paddle/fluid/dataloader/ops.py | 70 ++++++++++++++++++++ 7 files changed, 86 insertions(+), 3 deletions(-) create mode 100755 python/paddle/fluid/dataloader/ops.py diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto index 300d5f6e8fad10..831259f8ab3bf9 100644 --- a/paddle/fluid/framework/framework.proto +++ b/paddle/fluid/framework/framework.proto @@ -152,6 +152,8 @@ message VarType { STRINGS = 26; VOCAB = 
27; FEED_LIST = 28; + + LOD_TENSOR_BLOCKING_QUEUE = 31; } required Type type = 1; diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h index f4c41197a9dfa8..c95a122832863c 100644 --- a/paddle/fluid/framework/var_type_traits.h +++ b/paddle/fluid/framework/var_type_traits.h @@ -210,6 +210,8 @@ REG_PROTO_VAR_TYPE_TRAIT(LoDRankTable, proto::VarType::LOD_RANK_TABLE); REG_PROTO_VAR_TYPE_TRAIT(LoDTensorArray, proto::VarType::LOD_TENSOR_ARRAY); REG_PROTO_VAR_TYPE_TRAIT(platform::PlaceList, proto::VarType::PLACE_LIST); REG_PROTO_VAR_TYPE_TRAIT(ReaderHolder, proto::VarType::READER); +REG_PROTO_VAR_TYPE_TRAIT(operators::reader::LoDTensorBlockingQueueHolder, + proto::VarType::LOD_TENSOR_BLOCKING_QUEUE); REG_PROTO_VAR_TYPE_TRAIT(FeedList, proto::VarType::FEED_LIST); REG_PROTO_VAR_TYPE_TRAIT(FetchList, proto::VarType::FETCH_LIST); REG_PROTO_VAR_TYPE_TRAIT(int, proto::VarType::INT32); diff --git a/paddle/fluid/operators/data/map_op.cc b/paddle/fluid/operators/data/map_op.cc index 0fd5416a1a512e..3339c00dbe1031 100644 --- a/paddle/fluid/operators/data/map_op.cc +++ b/paddle/fluid/operators/data/map_op.cc @@ -54,7 +54,7 @@ class MapOpMaker : public framework::OpProtoAndCheckerMaker { .AsDuplicable(); AddAttr("global_block", "(BlockDesc *)" - "The global block of executed dataloader program " + "The global block of executed map program " "desc."); AddAttr("start_op_index", "(int64_t)" @@ -66,6 +66,12 @@ class MapOpMaker : public framework::OpProtoAndCheckerMaker { "(int64_t)" "The unique hash id used as cache key for " "ExecutorInfoCache"); + AddAttr>("input_var_names", + "(list of string)" + "input variable names for map program"); + AddAttr>("output_var_names", + "(list of string)" + "output variable names for map program"); AddComment(R"DOC( Map Op )DOC"); diff --git a/paddle/fluid/operators/data/map_op.h b/paddle/fluid/operators/data/map_op.h index c6cea02cd09a71..fbd4865e57a916 100644 --- a/paddle/fluid/operators/data/map_op.h +++ 
b/paddle/fluid/operators/data/map_op.h @@ -70,13 +70,13 @@ class MapOpKernel { void Compute(const framework::ExecutionContext& ctx) const override { // Step1: get output vars and attrs auto input_vars = ctx.MultiInputVar("X"); - auto input_var_names = ctx.InputNames("X"); auto output_vars = ctx.MultiOutputVar("Out"); - auto output_var_names = ctx.OutputNames("Out"); CheckInputQueueStatus(input_vars); CheckAndInitOutputQueue(output_vars, /*capacity=*/2); + auto input_var_names = ctx.Attr>("input_var_names"); + auto output_var_names = ctx.Attr>("output_var_names"); auto* global_block = ctx.Attr("global_block"); auto start_op_index = ctx.Attr("start_op_index"); auto end_op_index = ctx.Attr("end_op_index"); diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index 6b1261214bd009..260541931aa9fe 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -72,6 +72,7 @@ std::map> op_ins_map = { {"sparse_momentum", {"Param", "Grad", "Velocity", "Index", "LearningRate"}}, {"rnn", {"Input", "PreState", "WeightList", "SequenceLength"}}, {"run_program", {"X", "Params"}}, + {"map", {"X"}}, {"fused_feedforward", {"Dropout1Seed", "Dropout2Seed", "Linear1Bias", "Linear2Bias", "Ln1Scale", "Ln1Bias", "Ln2Scale", "Ln2Bias"}}, @@ -187,6 +188,7 @@ std::map> op_passing_outs_map = { {"rnn", {"DropoutState"}}, {"run_program", {"Out", "DOut", "OutScope"}}, {"dataloader", {"Out"}}, + {"map", {"Out"}}, {"clear_float_status", {"FloatStatusOut"}}, {"get_float_status", {"FloatStatusOut"}}, }; diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index 984f3d1a31cce4..b2905bfd664ac3 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -227,6 +227,7 @@ void BindVarDsec(pybind11::module *m) { .value("LOD_TENSOR_ARRAY", pd::proto::VarType::LOD_TENSOR_ARRAY) .value("PLACE_LIST", 
pd::proto::VarType::PLACE_LIST) .value("READER", pd::proto::VarType::READER) + .value("LOD_TENSOR_BLOCKING_QUEUE", pd::proto::VarType::LOD_TENSOR_BLOCKING_QUEUE) .value("RAW", pd::proto::VarType::RAW) .value("STRING", pd::proto::VarType::STRING) .value("STRINGS", pd::proto::VarType::STRINGS) diff --git a/python/paddle/fluid/dataloader/ops.py b/python/paddle/fluid/dataloader/ops.py new file mode 100755 index 00000000000000..d180c396698b25 --- /dev/null +++ b/python/paddle/fluid/dataloader/ops.py @@ -0,0 +1,70 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import paddle +import paddle.fluid as fluid +import paddle.static as static + +from paddle.fluid import core, framework +from paddle.fluid.layers.utils import _hash_with_id +from paddle.common_ops_import import * + + +__all__ = ["map"] + + +def map(map_func, inputs): + assert not in_dygraph_mode(), \ + "paddle.io.map can only be used in static mode" + helper = LayerHelper("map", **locals()) + + # inputs are Variables hold LoDTensorBlockingQueue + # TODO: cannot get tensor shape from LoDTensorBlockingQueue + program_inputs = [static.data('input_{}'.format(i), [None]) for i in range(len(inputs))] + + # build map program + main_program = fluid.Program() + startup_program = fluid.Program() + with static.guard(main_program, startup_program): + program_outputs = map_func(*program_inputs) + + input_var_names = [v.name for v in program_inputs] + output_var_names = [v.name for v in program_outputs] + + global_block = self._main_program.desc.block(0) + program_id = _hash_with_id(main_program, map_func) + + outputs = \ + [helper.create_variable( + name=unique_name.generate("map"), + type=core.VarDesc.VarType.LOD_TENSOR_BLOCKING_QUEUE, + persistable=True) for _ in range(len(program_outputs))] + attrs = { + "global_block": global_block, + "program_id": program_id, + "start_op_index": 0, + "end_op_index": global_block.op_size(), + "input_var_names": input_var_names, + "output_var_names": output_var_names + } + + helper.append_op( + type="map", + inputs={"X": inputs}, + outputs={"Out": outputs}, + attrs=attrs) + + return outputs From d66ed311b7d22a305f860d5747199106446a0258 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Wed, 10 Nov 2021 16:18:01 +0000 Subject: [PATCH 09/95] add Shutdown for MapRunner --- paddle/fluid/operators/data/map_runner.cc | 17 +++++++++++------ paddle/fluid/operators/data/map_runner.h | 15 +++++++++++---- 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/operators/data/map_runner.cc 
b/paddle/fluid/operators/data/map_runner.cc index 9a6173d7fc2570..233b8d10cb0e4f 100644 --- a/paddle/fluid/operators/data/map_runner.cc +++ b/paddle/fluid/operators/data/map_runner.cc @@ -25,7 +25,7 @@ MapRunner::MapRunner( const std::vector> input_queues, const std::vector> output_queues) : thread_pool_(1), - closed_(false), + running_(true), global_block_(global_block), place_(place), start_op_index_(start_op_index), @@ -111,11 +111,11 @@ bool MapRunner::ShareInputsIntoScope() { void MapRunner::StartMapThread(std::shared_ptr executor, const std::vector &skip_vars) { thread_pool_.enqueue([this, executor, skip_vars]() -> void { - while (!closed_.load()) { + while (running_.load()) { // Step1: get input LoDTensor and share into Scope bool success = ShareInputsIntoScope(); if (!success) { - Close(); + Shutdown(); break; } @@ -157,12 +157,17 @@ void MapRunner::CheckOutputVarStatus(const Variable &var, var_name)); } -inline void MapRunner::Close() { - VLOG(1) << "MapRunner close"; +void MapRunner::Shutdown() { + VLOG(1) << "MapRunner shutdown"; + // close all output queue, op after this op can shutdown itself for (auto queue : output_queues_) { queue->Close(); } - closed_ = true; + + // set running_ as false to exit map thread, then release thread pool + running_ = false; + // FIXME: ThreadPool doesn't have shutdown method + delete &thread_pool_; } // initialization static variables out of MapRunnerManager diff --git a/paddle/fluid/operators/data/map_runner.h b/paddle/fluid/operators/data/map_runner.h index 3495810c8d0455..9139158b994c87 100644 --- a/paddle/fluid/operators/data/map_runner.h +++ b/paddle/fluid/operators/data/map_runner.h @@ -48,11 +48,10 @@ class MapRunner { // Close(); // } - inline bool IsClosed() { return closed_; } + void Shutdown(); - inline void Close(); + inline bool IsRunning() { return running_; } - inline void Reset(); private: void copy_tensor(const framework::LoDTensor &lod_tensor, @@ -71,7 +70,7 @@ class MapRunner { void 
CheckOutputVarStatus(const Variable &var, const std::string &var_name); ThreadPool thread_pool_; - std::atomic closed_; + std::atomic running_; Scope scope_; std::shared_ptr global_block_; @@ -124,6 +123,14 @@ class MapRunnerManager { } } + void ShutdownMapRunner(int program_id) { + auto iter = prog_id_to_runner_.find(program_id); + if (iter != prog_id_to_runner_.end()) { + iter->second.get()->Shutdown(); + prog_id_to_runner_.erase(iter); + } + } + MapRunnerManager() { VLOG(1) << "MapRunnerManager init"; } ~MapRunnerManager() { From 5ad0af5f22f07084dbc7c3f1f1c05ac3d94eaae0 Mon Sep 17 00:00:00 2001 From: LielinJiang Date: Thu, 28 Oct 2021 02:32:15 +0000 Subject: [PATCH 10/95] add file reader --- .../fluid/operators/file_label_reader_op.cc | 275 ++++++++++++++++++ python/paddle/vision/ops.py | 52 ++++ 2 files changed, 327 insertions(+) create mode 100644 paddle/fluid/operators/file_label_reader_op.cc diff --git a/paddle/fluid/operators/file_label_reader_op.cc b/paddle/fluid/operators/file_label_reader_op.cc new file mode 100644 index 00000000000000..19767c56a616e6 --- /dev/null +++ b/paddle/fluid/operators/file_label_reader_op.cc @@ -0,0 +1,275 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include + +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { + +using LoDTensorArray = framework::LoDTensorArray; + +enum BufferStatus { + kBufferStatusSuccess = 0, + kBufferStatusErrorClosed, + kBufferStatusEmpty +}; + +template +class Buffer final { + public: + explicit Buffer(size_t max_len = 2) : max_len_(max_len), is_closed_(false) {} + ~Buffer() = default; + + BufferStatus Push(const T& item); + BufferStatus Pull(T* item); + BufferStatus TryReceive(T* item); + void Close(); + + private: + std::queue queue_; + mutable std::mutex mutex_; + size_t max_len_; + bool is_closed_; + std::condition_variable cond_; +}; + +template +BufferStatus Buffer::Push(const T& item) { + std::unique_lock lock(mutex_); + cond_.wait(lock, [this]() { return queue_.size() < max_len_ || is_closed_; }); + if (is_closed_) { + return kBufferStatusErrorClosed; + } + + queue_.push(item); + cond_.notify_one(); + return kBufferStatusSuccess; +} + +template +BufferStatus Buffer::Pull(T* item) { + std::unique_lock lock(mutex_); + cond_.wait(lock, [this]() { return (!queue_.empty()) || is_closed_; }); + if (queue_.empty()) { + return kBufferStatusErrorClosed; + } + *item = queue_.front(); + queue_.pop(); + if (queue_.size() < max_len_) { + cond_.notify_all(); + } + return kBufferStatusSuccess; +} + +template +void Buffer::Close() { + std::unique_lock lock(mutex_); + is_closed_ = true; + cond_.notify_all(); +} + +class FileDataReader { + public: + explicit FileDataReader(const framework::ExecutionContext& ctx) { + std::vector files = + ctx.Attr>("files"); + std::vector labels = ctx.Attr>("labels"); + rank_ = ctx.Attr("rank"); + world_size_ = ctx.Attr("world_size"); + std::cout << "files and labels size: " << files.size() << " " + << 
labels.size() << std::endl; + batch_size_ = ctx.Attr("batch_size"); + current_epoch_ = 0; + current_iter_ = 0; + is_closed_ = false; + for (int i = 0, n = files.size(); i < n; i++) + image_label_pairs_.emplace_back(std::move(files[i]), labels[i]); + StartLoadThread(); + } + + int GetStartIndex() { + return batch_size_ * world_size_ * current_iter_ + rank_ * batch_size_; + } + + framework::LoDTensor ReadSample(const std::string filename) { + std::ifstream input(filename.c_str(), + std::ios::in | std::ios::binary | std::ios::ate); + std::streamsize file_size = input.tellg(); + + input.seekg(0, std::ios::beg); + + // auto* out = ctx.Output("Out"); + framework::LoDTensor out; + std::vector out_shape = {file_size}; + out.Resize(framework::make_ddim(out_shape)); + + uint8_t* data = out.mutable_data(platform::CPUPlace()); + + input.read(reinterpret_cast(data), file_size); + return out; + } + + void StartLoadThread() { + if (load_thrd_.joinable()) { + return; + } + + load_thrd_ = std::thread([this] { + while (!is_closed_.load() && LoadBatch()) { + } + }); + } + + LoDTensorArray Read() { + LoDTensorArray ret; + ret.reserve(batch_size_); + int start_index = GetStartIndex(); + for (int32_t i = start_index; i < start_index + batch_size_; ++i) { + framework::LoDTensor tmp = ReadSample(image_label_pairs_[i].first); + ret.push_back(std::move(tmp)); + } + return ret; + } + + LoDTensorArray Next() { + LoDTensorArray batch_data; + batch_buffer_.Pull(&batch_data); + return batch_data; + } + + bool LoadBatch() { + std::cout << "start LoadBatch 0.01" << std::endl; + LoDTensorArray batch_data = std::move(Read()); + return batch_buffer_.Push(batch_data) == BufferStatus::kBufferStatusSuccess; + } + + private: + int batch_size_; + std::string file_root_, file_list_; + std::vector> image_label_pairs_; + int current_epoch_; + int current_iter_; + int rank_; + int world_size_; + std::atomic is_closed_; + Buffer batch_buffer_; + std::thread load_thrd_; +}; + +class FileDataReaderWrapper { + 
public: + void SetUp(const framework::ExecutionContext& ctx) { + reader.reset(new FileDataReader(ctx)); + } + + std::shared_ptr reader = nullptr; +}; + +FileDataReaderWrapper reader_wrapper; + +template +class CPUFileLabelKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + if (reader_wrapper.reader == nullptr) { + // create reader + reader_wrapper.SetUp(ctx); + } + LoDTensorArray samples = reader_wrapper.reader->Next(); + auto* out = ctx.OutputVar("Out"); + auto& out_array = *out->GetMutable(); + out_array.resize(samples.size()); + for (size_t i = 0; i < samples.size(); ++i) { + copy_tensor(samples[i], &out_array[i]); + } + } + + private: + void copy_tensor(const framework::LoDTensor& lod_tensor, + framework::LoDTensor* out) const { + if (lod_tensor.numel() == 0) return; + auto& out_tensor = *out; + TensorCopy(lod_tensor, lod_tensor.place(), &out_tensor); + out_tensor.set_lod(lod_tensor.lod()); + } +}; + +class FileLabelReaderOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, + platform::errors::InvalidArgument( + "Output(Out) of ReadFileOp is null.")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(framework::proto::VarType::UINT8, + platform::CPUPlace()); + } +}; + +class FileLabelReaderInferVarType : public framework::VarTypeInference { + public: + void operator()(framework::InferVarTypeContext* ctx) const override { + ctx->SetOutputType("Out", framework::proto::VarType::LOD_TENSOR_ARRAY, + framework::ALL_ELEMENTS); + // ctx->SetOutputType("SentenceScores", + // framework::proto::VarType::LOD_TENSOR, + // framework::ALL_ELEMENTS); + } +}; + +class FileLabelReaderOpMaker : public 
framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddOutput("Out", "The output tensor of ReadFile op"); + AddComment(R"DOC( +This operator read a file. +)DOC"); + AddAttr("root_dir", "Path of the file to be readed.") + .SetDefault(""); + AddAttr("batch_size", "Path of the file to be readed.").SetDefault(1); + AddAttr("rank", "Path of the file to be readed.").SetDefault(0); + AddAttr("world_size", "Path of the file to be readed.").SetDefault(1); + AddAttr>("files", "Path of the file to be readed.") + .SetDefault({}); + AddAttr>("labels", "Path of the file to be readed.") + .SetDefault({}); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR( + file_label_reader, ops::FileLabelReaderOp, ops::FileLabelReaderOpMaker, + ops::FileLabelReaderInferVarType, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker) + +REGISTER_OP_CPU_KERNEL(file_label_reader, ops::CPUFileLabelKernel) diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index 965cf8b55e7936..9553ecdf96aecf 100644 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -861,6 +861,58 @@ def read_file(filename, name=None): return out +def file_label_reader(file_root, batch_size, name=None): + """ + Reads and outputs the bytes contents of a file as a uint8 Tensor + with one dimension. + + Args: + filename (str): Path of the file to be read. + name (str, optional): The default value is None. Normally there is no + need for user to set this property. For more information, please + refer to :ref:`api_guide_Name`. + + Returns: + A uint8 tensor. + + Examples: + .. 
code-block:: python + + import cv2 + import paddle + + image = paddle.vision.ops.file_label_reader('/workspace/datasets/ILSVRC2012/val/', 2) + + """ + from paddle.vision.datasets import DatasetFolder + data_folder = DatasetFolder(file_root) + samples = [s[0] for s in data_folder.samples] + targets = [s[1] for s in data_folder.samples] + + if in_dygraph_mode(): + return _C_ops.file_label_reader('root_dir', file_root, 'batch_size', + batch_size, 'files', samples, 'labels', + targets) + + inputs = dict() + attrs = { + 'root_dir': root_dir, + 'batch_size': batch_size, + 'files': samples, + 'labels': targets + } + + helper = LayerHelper("file_label_reader", **locals()) + out = helper.create_variable_for_type_inference('uint8') + helper.append_op( + type="file_label_reader", + inputs=inputs, + attrs=attrs, + outputs={"Out": out}) + + return out + + def decode_jpeg(x, mode='unchanged', name=None): """ Decodes a JPEG image into a 3 dimensional RGB Tensor or 1 dimensional Gray Tensor. From 8ba9417d5b42d653e9522138c26076d5f8247af0 Mon Sep 17 00:00:00 2001 From: LielinJiang Date: Wed, 3 Nov 2021 12:30:22 +0000 Subject: [PATCH 11/95] add decode --- paddle/fluid/operators/decode_op.cc | 121 ++++++++++++++ paddle/fluid/operators/decode_op.cu | 151 ++++++++++++++++++ .../fluid/operators/file_label_reader_op.cc | 83 ++++++---- python/paddle/vision/ops.py | 61 ++++++- 4 files changed, 381 insertions(+), 35 deletions(-) create mode 100644 paddle/fluid/operators/decode_op.cc create mode 100644 paddle/fluid/operators/decode_op.cu diff --git a/paddle/fluid/operators/decode_op.cc b/paddle/fluid/operators/decode_op.cc new file mode 100644 index 00000000000000..e2ca1d20357271 --- /dev/null +++ b/paddle/fluid/operators/decode_op.cc @@ -0,0 +1,121 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { + +template +class CPUBatchDecodeJpegKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + // TODO(LieLinJiang): add cpu implement. + PADDLE_THROW(platform::errors::Unimplemented( + "DecodeJpeg op only supports GPU now.")); + } +}; + +class BatchDecodeJpegOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "DecodeJpeg"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "DecodeJpeg"); + + auto mode = ctx->Attrs().Get("mode"); + std::vector out_dims; + + if (mode == "unchanged") { + out_dims = {-1, -1, -1}; + } else if (mode == "gray") { + out_dims = {1, -1, -1}; + } else if (mode == "rgb") { + out_dims = {3, -1, -1}; + } else { + PADDLE_THROW(platform::errors::Fatal( + "The provided mode is not supported for JPEG files on GPU: ", mode)); + } + + // ctx->SetOutputDim("Out", framework::make_ddim(out_dims)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + 
} + + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const framework::Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const { + if (var_name == "X") { + return expected_kernel_type; + } + + return framework::OpKernelType(tensor.type(), tensor.place(), + tensor.layout()); + } +}; + +class BatchDecodeJpegInferVarType : public framework::VarTypeInference { + public: + void operator()(framework::InferVarTypeContext* ctx) const override { + ctx->SetOutputType("Out", framework::proto::VarType::LOD_TENSOR_ARRAY, + framework::ALL_ELEMENTS); + } +}; + +class BatchDecodeJpegOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "A one dimensional uint8 tensor containing the raw bytes " + "of the JPEG image. It is a tensor with rank 1."); + AddOutput("Out", "The output tensor of DecodeJpeg op"); + AddComment(R"DOC( +This operator decodes a JPEG image into a 3 dimensional RGB Tensor +or 1 dimensional Gray Tensor. Optionally converts the image to the +desired format. The values of the output tensor are uint8 between 0 +and 255. +)DOC"); + AddAttr( + "mode", + "(string, default \"unchanged\"), The read mode used " + "for optionally converting the image, can be \"unchanged\" " + ",\"gray\" , \"rgb\" .") + .SetDefault("unchanged"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR( + decode, ops::BatchDecodeJpegOp, ops::BatchDecodeJpegOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker) + +REGISTER_OP_CPU_KERNEL(decode, ops::CPUBatchDecodeJpegKernel) diff --git a/paddle/fluid/operators/decode_op.cu b/paddle/fluid/operators/decode_op.cu new file mode 100644 index 00000000000000..bc0a464254a7b0 --- /dev/null +++ b/paddle/fluid/operators/decode_op.cu @@ -0,0 +1,151 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#if !defined(WITH_NV_JETSON) && !defined(PADDLE_WITH_HIP) + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/dynload/nvjpeg.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/stream/cuda_stream.h" + +namespace paddle { +namespace operators { + +static cudaStream_t batch_nvjpeg_stream = nullptr; +static nvjpegHandle_t batch_nvjpeg_handle = nullptr; + +void batch_InitNvjpegImage(nvjpegImage_t* img) { + for (int c = 0; c < NVJPEG_MAX_COMPONENT; c++) { + img->channel[c] = nullptr; + img->pitch[c] = 0; + } +} + +template +class GPUBatchDecodeJpegKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + // Create nvJPEG handle + if (batch_nvjpeg_handle == nullptr) { + nvjpegStatus_t create_status = + platform::dynload::nvjpegCreateSimple(&batch_nvjpeg_handle); + + PADDLE_ENFORCE_EQ(create_status, NVJPEG_STATUS_SUCCESS, + platform::errors::Fatal("nvjpegCreateSimple failed: ", + create_status)); + } + + nvjpegJpegState_t nvjpeg_state; + nvjpegStatus_t state_status = platform::dynload::nvjpegJpegStateCreate( + batch_nvjpeg_handle, &nvjpeg_state); + + PADDLE_ENFORCE_EQ(state_status, NVJPEG_STATUS_SUCCESS, + platform::errors::Fatal("nvjpegJpegStateCreate failed: ", + state_status)); + + const framework::LoDTensorArray* ins = + ctx.Input("X"); + // auto* outs = ctx.Output("Out"); + // 
auto *out = scope.FindVar(Output("Out")); + // std::cout << "debug 0.05" << std::endl; + // auto &out_array = outs->GetMutable(); + auto* out = ctx.OutputVar("Out"); + auto& out_array = *out->GetMutable(); + out_array.resize(ins->size()); + std::cout << "decode: " << ins->size() << std::endl; + for (int i = 0; i < ins->size(); i++) { + const framework::LoDTensor x = ins->at(i); + framework::LoDTensor out = out_array.at(i); + int components; + nvjpegChromaSubsampling_t subsampling; + int widths[NVJPEG_MAX_COMPONENT]; + int heights[NVJPEG_MAX_COMPONENT]; + + auto* x_data = x.data(); + + nvjpegStatus_t info_status = platform::dynload::nvjpegGetImageInfo( + batch_nvjpeg_handle, x_data, (size_t)x.numel(), &components, + &subsampling, widths, heights); + + PADDLE_ENFORCE_EQ( + info_status, NVJPEG_STATUS_SUCCESS, + platform::errors::Fatal("nvjpegGetImageInfo failed: ", info_status)); + + int width = widths[0]; + int height = heights[0]; + + nvjpegOutputFormat_t output_format; + int output_components; + + auto mode = ctx.Attr("mode"); + if (mode == "unchanged") { + if (components == 1) { + output_format = NVJPEG_OUTPUT_Y; + output_components = 1; + } else if (components == 3) { + output_format = NVJPEG_OUTPUT_RGB; + output_components = 3; + } else { + platform::dynload::nvjpegJpegStateDestroy(nvjpeg_state); + PADDLE_THROW(platform::errors::Fatal( + "The provided mode is not supported for JPEG files on GPU")); + } + } else if (mode == "gray") { + output_format = NVJPEG_OUTPUT_Y; + output_components = 1; + } else if (mode == "rgb") { + output_format = NVJPEG_OUTPUT_RGB; + output_components = 3; + } else { + platform::dynload::nvjpegJpegStateDestroy(nvjpeg_state); + PADDLE_THROW(platform::errors::Fatal( + "The provided mode is not supported for JPEG files on GPU")); + } + + nvjpegImage_t out_image; + batch_InitNvjpegImage(&out_image); + + // create nvjpeg stream + if (batch_nvjpeg_stream == nullptr) { + cudaStreamCreateWithFlags(&batch_nvjpeg_stream, cudaStreamNonBlocking); + 
} + + int sz = widths[0] * heights[0]; + + // auto* out = ctx.Output("Out"); + std::vector out_shape = {output_components, height, width}; + out.Resize(framework::make_ddim(out_shape)); + + T* data = out.mutable_data(ctx.GetPlace()); + + for (int c = 0; c < output_components; c++) { + out_image.channel[c] = data + c * sz; + out_image.pitch[c] = width; + } + + nvjpegStatus_t decode_status = platform::dynload::nvjpegDecode( + batch_nvjpeg_handle, nvjpeg_state, x_data, x.numel(), output_format, + &out_image, batch_nvjpeg_stream); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(decode, ops::GPUBatchDecodeJpegKernel) + +#endif diff --git a/paddle/fluid/operators/file_label_reader_op.cc b/paddle/fluid/operators/file_label_reader_op.cc index 19767c56a616e6..0094d2b34c3813 100644 --- a/paddle/fluid/operators/file_label_reader_op.cc +++ b/paddle/fluid/operators/file_label_reader_op.cc @@ -189,13 +189,44 @@ FileDataReaderWrapper reader_wrapper; template class CPUFileLabelKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override { + void Compute(const framework::ExecutionContext& ctx) const override {} +}; + +class FileLabelReaderOp : public framework::OperatorBase { + public: + // using framework::OperatorWithKernel::OperatorWithKernel; + FileLabelReaderOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext* ctx) const { + PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, + platform::errors::InvalidArgument( + "Output(Out) of ReadFileOp is null.")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + return framework::OpKernelType(framework::proto::VarType::UINT8, + 
platform::CPUPlace()); + } + + private: + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& dev_ctx = *pool.Get(dev_place); + framework::RuntimeContext run_ctx(Inputs(), Outputs(), scope); + framework::ExecutionContext ctx(*this, scope, dev_ctx, run_ctx); if (reader_wrapper.reader == nullptr) { // create reader reader_wrapper.SetUp(ctx); } LoDTensorArray samples = reader_wrapper.reader->Next(); - auto* out = ctx.OutputVar("Out"); + auto* out = scope.FindVar(Output("Out")); auto& out_array = *out->GetMutable(); out_array.resize(samples.size()); for (size_t i = 0; i < samples.size(); ++i) { @@ -203,7 +234,6 @@ class CPUFileLabelKernel : public framework::OpKernel { } } - private: void copy_tensor(const framework::LoDTensor& lod_tensor, framework::LoDTensor* out) const { if (lod_tensor.numel() == 0) return; @@ -211,35 +241,8 @@ class CPUFileLabelKernel : public framework::OpKernel { TensorCopy(lod_tensor, lod_tensor.place(), &out_tensor); out_tensor.set_lod(lod_tensor.lod()); } -}; - -class FileLabelReaderOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of ReadFileOp is null.")); - } - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(framework::proto::VarType::UINT8, - platform::CPUPlace()); - } -}; - -class FileLabelReaderInferVarType : public framework::VarTypeInference { - public: - void operator()(framework::InferVarTypeContext* ctx) const override { - ctx->SetOutputType("Out", framework::proto::VarType::LOD_TENSOR_ARRAY, - framework::ALL_ELEMENTS); - // ctx->SetOutputType("SentenceScores", - // 
framework::proto::VarType::LOD_TENSOR, - // framework::ALL_ELEMENTS); - } + // std::shared_ptr reader=nullptr; }; class FileLabelReaderOpMaker : public framework::OpProtoAndCheckerMaker { @@ -261,6 +264,22 @@ This operator read a file. } }; +class FileLabelReaderInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* context) const override { + OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", + "FileLabelReader"); + } +}; + +class FileLabelReaderInferVarType : public framework::VarTypeInference { + public: + void operator()(framework::InferVarTypeContext* ctx) const override { + ctx->SetOutputType("Out", framework::proto::VarType::LOD_TENSOR_ARRAY, + framework::ALL_ELEMENTS); + } +}; + } // namespace operators } // namespace paddle @@ -268,7 +287,7 @@ namespace ops = paddle::operators; REGISTER_OPERATOR( file_label_reader, ops::FileLabelReaderOp, ops::FileLabelReaderOpMaker, - ops::FileLabelReaderInferVarType, + ops::FileLabelReaderInferShape, ops::FileLabelReaderInferVarType, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker) diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index 9553ecdf96aecf..4a937d6aa9b9af 100644 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -13,7 +13,7 @@ # limitations under the License. 
import numpy as np -from ..fluid.layer_helper import LayerHelper +from ..fluid.layer_helper import LayerHelper, unique_name from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype from ..fluid import core, layers from ..fluid.layers import nn, utils @@ -896,14 +896,18 @@ def file_label_reader(file_root, batch_size, name=None): inputs = dict() attrs = { - 'root_dir': root_dir, + 'root_dir': file_root, 'batch_size': batch_size, 'files': samples, 'labels': targets } helper = LayerHelper("file_label_reader", **locals()) - out = helper.create_variable_for_type_inference('uint8') + # out = helper.create_variable_for_type_inference('uint8') + out = helper.create_variable( + name=unique_name.generate("file_label_reader"), + type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, + dtype='uint8') helper.append_op( type="file_label_reader", inputs=inputs, @@ -913,6 +917,57 @@ def file_label_reader(file_root, batch_size, name=None): return out +def image_decode(x, mode='unchanged', name=None): + """ + Decodes a JPEG image into a 3 dimensional RGB Tensor or 1 dimensional Gray Tensor. + Optionally converts the image to the desired format. + The values of the output tensor are uint8 between 0 and 255. + + Args: + x (Tensor): A one dimensional uint8 tensor containing the raw bytes + of the JPEG image. + mode (str): The read mode used for optionally converting the image. + Default: 'unchanged'. + name (str, optional): The default value is None. Normally there is no + need for user to set this property. For more information, please + refer to :ref:`api_guide_Name`. + Returns: + Tensor: A decoded image tensor with shape (imge_channels, image_height, image_width) + + Examples: + .. 
code-block:: python + import cv2 + import paddle + + fake_img = (np.random.random( + (400, 300, 3)) * 255).astype('uint8') + + cv2.imwrite('fake.jpg', fake_img) + + img_bytes = paddle.vision.ops.read_file('fake.jpg') + img = paddle.vision.ops.decode_jpeg(img_bytes) + + print(img.shape) + """ + + if in_dygraph_mode(): + return _C_ops.decode(x, "mode", mode) + + inputs = {'X': x} + attrs = {"mode": mode} + + helper = LayerHelper("image_decode", **locals()) + out = helper.create_variable( + name=unique_name.generate("image_decode"), + type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, + dtype=x.dtype) + # out = helper.create_variable_for_type_inference('uint8') + helper.append_op( + type="decode", inputs=inputs, attrs=attrs, outputs={"Out": out}) + + return out + + def decode_jpeg(x, mode='unchanged', name=None): """ Decodes a JPEG image into a 3 dimensional RGB Tensor or 1 dimensional Gray Tensor. From 88b78094ebbb3f9910d9748106704de711ea7a71 Mon Sep 17 00:00:00 2001 From: ghostxsl <451323469@qq.com> Date: Tue, 9 Nov 2021 14:49:00 +0800 Subject: [PATCH 12/95] [data op] add random_crop_and_resize_op --- .../operators/random_crop_and_resize_op.cc | 137 ++++++++ .../operators/random_crop_and_resize_op.cu | 326 ++++++++++++++++++ .../operators/random_crop_and_resize_op.h | 28 ++ python/paddle/vision/ops.py | 93 +++++ 4 files changed, 584 insertions(+) create mode 100644 paddle/fluid/operators/random_crop_and_resize_op.cc create mode 100644 paddle/fluid/operators/random_crop_and_resize_op.cu create mode 100644 paddle/fluid/operators/random_crop_and_resize_op.h diff --git a/paddle/fluid/operators/random_crop_and_resize_op.cc b/paddle/fluid/operators/random_crop_and_resize_op.cc new file mode 100644 index 00000000000000..143cafc8ed8d3c --- /dev/null +++ b/paddle/fluid/operators/random_crop_and_resize_op.cc @@ -0,0 +1,137 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/random_crop_and_resize_op.h" + +namespace paddle { +namespace operators { + +class RandomCropAndResizeOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "RandomCropAndResize"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", + "RandomCropAndResize"); + + auto size = ctx->Attrs().Get>("size"); + PADDLE_ENFORCE_EQ(size.size(), 2, + platform::errors::InvalidArgument( + "The length of Attrs(size) should be 2.")); + PADDLE_ENFORCE_GT(size[0], 0, + platform::errors::InvalidArgument( + "h in Attr(size) of Op(RandomCropAndResize) " + "should be greater than 0.")); + PADDLE_ENFORCE_GT(size[1], 0, + platform::errors::InvalidArgument( + "w in Attr(size) of Op(RandomCropAndResize) " + "should be greater than 0.")); + auto x_dim = ctx->GetInputsDim("X"); // NCHW format + + std::vector out_dim = {static_cast(x_dim.size()), + x_dim[0][0], size[0], size[1]}; + ctx->SetOutputDim("Out", framework::make_ddim({out_dim})); + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } + + framework::OpKernelType GetKernelTypeForVar( + 
const std::string& var_name, const Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const override { + if (var_name == "X") { + return expected_kernel_type; + } + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); + } +}; + +class RandomCropAndResizeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(LoDTensorArray). A batch of instances to random crop."); + AddOutput("Out", "(Tensor). The cropped instance batch."); + AddAttr>( + "size", "expected output size of the crop, for each edge."); + AddAttr>( + "scale", + "Specifies the lower and upper bounds" + "for the random area of the crop, before resizing."); + AddAttr>( + "ratio", + "lower and upper bounds for the random aspect ratio of the crop, " + "before resizing."); + AddAttr("interp_method", + "(string, default \"bilinear\"), interpolation " + "method, can be \"bilinear\" for " + "bilinear interpolation and \"nearest\" for nearest " + "neighbor interpolation.") + .SetDefault("bilinear"); + AddAttr( + "align_corners", + "an optional bool. Defaults to True. " + "If True, the centers of 4 corner pixels of the input and output " + "tensors are aligned, preserving the values at the corner pixels, " + "If False, are not aligned") + .SetDefault(true); + AddAttr("align_mode", + "(int, default \'1\'), optional for bilinear interpolation, " + "can be \'0\' for src_idx = scale*(dst_indx+0.5)-0.5 , " + "can be \'1\' for src_idx = scale*dst_index .") + .SetDefault(1); + AddAttr( + "data_layout", + "(string, default NCHW) Only used in " + "an optional string from: \"NHWC\", \"NCHW\". " + "Specify that the data format of the input and output data is " + "channel_first or channel_last.") + .SetDefault("NCHW"); + AddAttr("seed", "The random seed. ").SetDefault(0); + AddComment(R"DOC( + Crop the input data to random size and aspect ratio. 
+ A crop of random size (default: of 0.08 to 1.0) of the original size and a random + aspect ratio (default: of 3/4 to 1.33) of the original aspect ratio is made. + After applying crop transfrom, the input data will be resized to given size. + )DOC"); + } +}; + +template +class RandomCropAndResizeCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + // no cpu kernel. + PADDLE_THROW(platform::errors::Unimplemented( + "RandomCropAndResize op only supports GPU now.")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR( + random_crop_and_resize, ops::RandomCropAndResizeOp, + ops::RandomCropAndResizeOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_CPU_KERNEL(random_crop_and_resize, + ops::RandomCropAndResizeCPUKernel) diff --git a/paddle/fluid/operators/random_crop_and_resize_op.cu b/paddle/fluid/operators/random_crop_and_resize_op.cu new file mode 100644 index 00000000000000..3381d9f2d2703c --- /dev/null +++ b/paddle/fluid/operators/random_crop_and_resize_op.cu @@ -0,0 +1,326 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/operators/random_crop_and_resize_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/gpu_launch_config.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; +using DataLayout = framework::DataLayout; + +template +__global__ void KeNearestNeighborInterpFw( + const T* in, const size_t in_img_h, const size_t in_img_w, + const size_t input_h, const size_t input_w, T* out, const size_t out_img_h, + const size_t out_img_w, const size_t output_h, const size_t output_w, + const size_t num_channels, const float ratio_h, const float ratio_w, + const size_t idx_h, const size_t idx_w, const bool align_corners, + const DataLayout data_layout) { + int nthreads = output_h * output_w; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + for (; tid < nthreads; tid += stride) { + // batch size + int out_id_h = tid / output_w; + // single image's index + int out_id_w = tid % output_w; + // input_w or output_w = c * h * w + // img_size = h * w + int in_img_size = input_w / num_channels; + int out_img_size = output_w / num_channels; + + // get output c, h, w index + int channel_id, out_img_idy, out_img_idx; + if (data_layout == DataLayout::kNCHW) { + channel_id = out_id_w / out_img_size; + out_img_idy = (out_id_w % out_img_size) / out_img_w; + out_img_idx = tid % out_img_w; + } else { + out_img_idy = out_id_w / (out_img_w * num_channels); + out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; + channel_id = tid % num_channels; + } + + // get input h index with offset + int in_img_idy = (align_corners) + ? static_cast(ratio_h * out_img_idy + 0.5) + : static_cast(ratio_h * out_img_idy); + in_img_idy += idx_h; + // get input w index with offset + int in_img_idx = (align_corners) + ? 
static_cast(ratio_w * out_img_idx + 0.5) + : static_cast(ratio_w * out_img_idx); + in_img_idx += idx_w; + + if (data_layout == DataLayout::kNCHW) { + out[tid] = in[out_id_h * input_w + channel_id * in_img_size + + in_img_idy * in_img_w + in_img_idx]; + } else { + out[tid] = in[out_id_h * input_w + in_img_idy * in_img_w * num_channels + + in_img_idx * num_channels + channel_id]; + } + } +} + +template +__global__ void KeBilinearInterpFw( + const T* in, const size_t in_img_h, const size_t in_img_w, + const size_t input_h, const size_t input_w, T* out, const size_t out_img_h, + const size_t out_img_w, const size_t output_h, const size_t output_w, + const size_t num_channels, const float ratio_h, const float ratio_w, + const size_t idx_h, const size_t idx_w, const bool align_corners, + const int align_mode, const DataLayout data_layout) { + int nthreads = output_h * output_w; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + bool align_flag = (align_mode == 0 && !align_corners); + for (; tid < nthreads; tid += stride) { + // batch size + int out_id_h = tid / output_w; + // single image's index + int out_id_w = tid % output_w; + // input_w or output_w = c * h * w + // img_size = h * w + int in_img_size = input_w / num_channels; + int out_img_size = output_w / num_channels; + + // get output c, h, w index + int channel_id, out_img_idy, out_img_idx; + if (data_layout == DataLayout::kNCHW) { + channel_id = out_id_w / out_img_size; + out_img_idy = (out_id_w % out_img_size) / out_img_w; + out_img_idx = tid % out_img_w; + } else { + out_img_idy = out_id_w / (out_img_w * num_channels); + out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; + channel_id = tid % num_channels; + } + + // get input h index with offset + int in_img_idy = align_flag + ? static_cast(ratio_h * (out_img_idy + 0.5) - 0.5) + : static_cast(ratio_h * out_img_idy); + in_img_idy = (in_img_idy > 0) ? 
in_img_idy + idx_h : idx_h; + int h_id = (in_img_idy < in_img_h + idx_h - 1) ? 1 : 0; + T src_h = ratio_h * (out_img_idy + 0.5) - 0.5; + src_h = (src_h > 0) ? src_h + idx_h : idx_h; + T h1lambda = align_flag ? src_h - in_img_idy + : ratio_h * out_img_idy + idx_h - in_img_idy; + T h2lambda = 1.f - h1lambda; + + // get input w index with offset + int in_img_idx = align_flag + ? static_cast(ratio_w * (out_img_idx + 0.5) - 0.5) + : static_cast(ratio_w * out_img_idx); + in_img_idx = (in_img_idx > 0) ? in_img_idx + idx_w : idx_w; + int w_id = (in_img_idx < in_img_w + idx_w - 1) ? 1 : 0; + T src_w = ratio_w * (out_img_idx + 0.5) - 0.5; + src_w = (src_w > 0) ? src_w + idx_w : idx_w; + T w1lambda = align_flag ? src_w - in_img_idx + : ratio_w * out_img_idx + idx_w - in_img_idx; + T w2lambda = 1.f - w1lambda; + + if (data_layout == DataLayout::kNCHW) { + const T* in_pos = &in[out_id_h * input_w + channel_id * in_img_size + + in_img_idy * in_img_w + in_img_idx]; + + // bilinear interpolation + out[out_id_h * output_w + out_id_w] = + h2lambda * (w2lambda * in_pos[0] + w1lambda * in_pos[w_id]) + + h1lambda * (w2lambda * in_pos[h_id * in_img_w] + + w1lambda * in_pos[h_id * in_img_w + w_id]); + } else { + const T* in_pos = + &in[out_id_h * input_w + in_img_idy * in_img_w * num_channels + + in_img_idx * num_channels + channel_id]; + + // bilinear interpolation + out[out_id_h * output_w + out_id_w] = + h2lambda * + (w2lambda * in_pos[0] + w1lambda * in_pos[w_id * num_channels]) + + h1lambda * (w2lambda * in_pos[h_id * in_img_w * num_channels] + + w1lambda * in_pos[h_id * in_img_w * num_channels + + w_id * num_channels]); + } + } +} + +template +static void RandomCropAndResizeFwd( + const framework::ExecutionContext& ctx, const framework::LoDTensor& input, + framework::Tensor* output, const std::vector out_size, + const std::string interp_method, const bool align_corners, + const int align_mode, const int img_h, const int img_w, const int c, + const int idx_h, const int idx_w, const 
int crop_h, const int crop_w, + const DataLayout data_layout) { + auto input_data = input.template data(); + int out_h = out_size[0]; + int out_w = out_size[1]; + + framework::DDim dim_out; + if (data_layout == DataLayout::kNCHW) { + dim_out = {c, out_h, out_w}; + } else { + dim_out = {out_h, out_w, c}; + } + auto output_data = output->template mutable_data(ctx.GetPlace()); + + if (img_h == crop_h && img_w == crop_w) { + framework::TensorCopy(input, ctx.GetPlace(), output); + return; + } + + float ratio_h = 0.f; + float ratio_w = 0.f; + if (out_h > 1) { + ratio_h = (align_corners) ? static_cast(crop_h - 1) / (out_h - 1) + : static_cast(crop_h) / out_h; + } + if (out_w > 1) { + ratio_w = (align_corners) ? static_cast(crop_w - 1) / (out_w - 1) + : static_cast(crop_w) / out_w; + } + + int in_chw = c * crop_h * crop_w; + int out_chw = c * out_h * out_w; + + platform::GpuLaunchConfig config = + platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), out_chw); + + if ("nearest" == interp_method) { + KeNearestNeighborInterpFw< + T><<>>( + input_data, crop_h, crop_w, 1, in_chw, output_data, out_h, out_w, 1, + out_chw, c, ratio_h, ratio_w, idx_h, idx_w, align_corners, data_layout); + } else if ("bilinear" == interp_method) { + KeBilinearInterpFw<<>>( + input_data, crop_h, crop_w, 1, in_chw, output_data, out_h, out_w, 1, + out_chw, c, ratio_h, ratio_w, idx_h, idx_w, align_corners, align_mode, + data_layout); + } +} + +static void GetCropParameters(const int height, const int width, + const std::vector scale, + const std::vector ratio, int* idx_h, + int* idx_w, int* crop_h, int* crop_w, + const int seed, int num_attempts = 10) { + double target_area, aspect_ratio; + double area = height * width; + std::vector log_ratio; + for (int i = 0; i < ratio.size(); i++) + log_ratio.push_back(std::log(ratio[i])); + std::default_random_engine engine(seed); + std::uniform_real_distribution dist_scale(scale[0], scale[1]); + std::uniform_real_distribution dist_log_ratio(log_ratio[0], + 
log_ratio[1]); + + for (int i = 0; i < num_attempts; i++) { + target_area = dist_scale(engine) * area; + aspect_ratio = std::exp(dist_log_ratio(engine)); + + *crop_w = + static_cast(std::round(std::sqrt(target_area * aspect_ratio))); + *crop_h = + static_cast(std::round(std::sqrt(target_area / aspect_ratio))); + if (*crop_w > 0 && *crop_w <= width && *crop_h > 0 && *crop_h <= height) { + std::uniform_int_distribution dist_crop_h(0, height - *crop_h); + *idx_h = dist_crop_h(engine); + std::uniform_int_distribution dist_crop_w(0, width - *crop_w); + *idx_w = dist_crop_w(engine); + return; + } + } + + // Fallback to central crop + float in_ratio = static_cast(width) / static_cast(height); + float min_ratio = ratio[0] > ratio[1] ? ratio[1] : ratio[0]; + float max_ratio = ratio[0] > ratio[1] ? ratio[0] : ratio[1]; + if (in_ratio < min_ratio) { + *crop_w = width; + *crop_h = static_cast(std::round(*crop_w / min_ratio)); + } else if (in_ratio > max_ratio) { + *crop_h = height; + *crop_w = static_cast(std::round(*crop_h * max_ratio)); + } else { + // return whole image + *crop_h = height; + *crop_w = width; + } + *idx_h = (height - *crop_h) / 2; + *idx_w = (width - *crop_w) / 2; +} + +template +class RandomCropAndResizeCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx.GetPlace()), true, + platform::errors::NotFound("This kernel only runs on GPU device.")); + // get input, output + // auto& x = ctx.MultiInput("X"); + auto* x = ctx.Input("X"); + PADDLE_ENFORCE_GT(x->size(), 0, + platform::errors::InvalidArgument( + "The size of X must be greater than 0.")); + auto* out = ctx.Output("Out"); + // get size, scale, ratio + auto size = ctx.Attr>("size"); + auto scale = ctx.Attr>("scale"); + auto ratio = ctx.Attr>("ratio"); + // get random seed + int seed = ctx.Attr("seed"); + // get data_layout + const std::string data_layout_str = ctx.Attr("data_layout"); + 
const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + // get interpolation method + const std::string interp_method = ctx.Attr("interp_method"); + bool align_corners = ctx.Attr("align_corners"); + int align_mode = ctx.Attr("align_mode"); + + auto* img = &x->at(0); + int img_h, img_w, img_c, idx_h, idx_w, crop_h, crop_w; + for (int i = 0; i < x->size(); i++) { + img = &x->at(i); + img_h = + data_layout == DataLayout::kNCHW ? img->dims()[1] : img->dims()[0]; + img_w = + data_layout == DataLayout::kNCHW ? img->dims()[2] : img->dims()[1]; + img_c = + data_layout == DataLayout::kNCHW ? img->dims()[0] : img->dims()[2]; + GetCropParameters(img_h, img_w, scale, ratio, &idx_h, &idx_w, &crop_h, + &crop_w, seed); + + auto out_tensor = out->Slice(i, i + 1); + RandomCropAndResizeFwd(ctx, *img, &out_tensor, size, interp_method, + align_corners, align_mode, img_h, img_w, img_c, + idx_h, idx_w, crop_h, crop_w, data_layout); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(random_crop_and_resize, + ops::RandomCropAndResizeCUDAKernel, + ops::RandomCropAndResizeCUDAKernel); diff --git a/paddle/fluid/operators/random_crop_and_resize_op.h b/paddle/fluid/operators/random_crop_and_resize_op.h new file mode 100644 index 00000000000000..820fc18043770c --- /dev/null +++ b/paddle/fluid/operators/random_crop_and_resize_op.h @@ -0,0 +1,28 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/device_context.h" + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#include +#endif diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index 4a937d6aa9b9af..cee0e53143b53f 100644 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -36,6 +36,7 @@ 'PSRoIPool', 'roi_align', 'RoIAlign', + 'random_crop_and_resize', ] @@ -1404,3 +1405,95 @@ def forward(self, x, boxes, boxes_num, aligned=True): output_size=self._output_size, spatial_scale=self._spatial_scale, aligned=aligned) + + +def random_crop_and_resize(x, + size, + scale=(0.08, 1.0), + ratio=(3. / 4., 4. / 3.), + interp_method='bilinear', + align_corners=True, + align_mode=1, + data_layout='NCHW', + seed=0, + name=None): + """ + This operator implements the paddle.vision.transforms.RandomResizedCrop. + Please refer to https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/vision/transforms/RandomResizedCrop_cn.html#randomresizedcrop + for details. This operator has only a GPU kernel. + + Args: + x (List[Tensor]): A list of input images, 3D-Tensor with the shape + of [C,H,W] or [H,W,c]. The data type is uint8 or float32. + size (int|list|tuple): Target size of output image, + with (height, width) shape. + scale (list|tuple): Scale range of the cropped image before resizing, + relatively to the origin image. Default: (0.08, 1.0) + ratio (list|tuple): Range of aspect ratio of the origin aspect ratio + cropped. Default: (0.75, 1.33) + interp_method (str, optional): Interpolation method. Default: 'bilinear'. 
+ support method are as following: + - "nearest", + - "bilinear" + align_corners (bool, optional): If True, the centers of 4 corner pixels + of the input and output tensors are aligned, preserving the values + at the corner pixels, If False, are not aligned. Default: True + align_mode (int32, optional): Optional for bilinear interpolation, + can be 0 for src_idx = scale*(dst_indx+0.5)-0.5, can be 1 for + src_idx = scale*dst_index. Default: 1 + data_layout (str, optional): Only used in an optional string + from: NHWC, NCHW. Specify that the data format of the input + and output data is channel_first or channel_last. Default: NCHW + seed (int, optional): The random seed. Default: 0 + name(str, optional): For detailed information, please refer to : + ref:`api_guide_Name`. Usually name is no need to set and None by + default. + + Returns: + Tensor: The output of RandomCropAndResizeOp is a 4-D tensor with shape + (batch_size, channels, h, w). The data type is uint8 or float32. + + Examples: + .. 
code-block:: python + + import paddle + from paddle.vision.ops import random_crop_and_resize + + data = paddle.rand([3, 256, 256]) + out = random_crop_and_resize([data]) + """ + check_type(size, 'size', (int, tuple), 'random_crop_and_resize') + check_type(scale, 'scale', (list, tuple), 'random_crop_and_resize') + check_type(ratio, 'ratio', (list, tuple), 'random_crop_and_resize') + assert interp_method in ['bilinear', 'nearest'] + assert data_layout in ['NCHW', 'NHWC'] + if isinstance(size, int): + size = (size, size) + + if in_dygraph_mode(): + out = _C_ops.random_crop_and_resize( + x, "size", size, "scale", scale, "ratio", ratio, "interp_method", + interp_method, "align_corners", align_corners, "align_mode", + align_mode, "data_layout", data_layout, "seed", seed) + return out + + helper = LayerHelper('random_crop_and_resize', **locals()) + dtype = helper.input_dtype() + out = helper.create_variable_for_type_inference(dtype) + inputs = {"X": x} + attrs = { + "size": size, + "scale": scale, + "ratio": ratio, + "interp_method": interp_method, + "align_corners": align_corners, + "align_mode": align_mode, + "data_layout": data_layout, + "seed": seed, + } + helper.append_op( + type="random_crop_and_resize", + inputs=inputs, + outputs={"Out": out}, + attrs=attrs) + return out From 1080a73d92ea93c939e1bfe38dbdd8ab3c4dbcc0 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Fri, 12 Nov 2021 03:04:27 +0000 Subject: [PATCH 13/95] fix compile error --- paddle/fluid/operators/random_crop_and_resize_op.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/random_crop_and_resize_op.cc b/paddle/fluid/operators/random_crop_and_resize_op.cc index 143cafc8ed8d3c..4bb12506c1f580 100644 --- a/paddle/fluid/operators/random_crop_and_resize_op.cc +++ b/paddle/fluid/operators/random_crop_and_resize_op.cc @@ -53,7 +53,7 @@ class RandomCropAndResizeOp : public framework::OperatorWithKernel { } framework::OpKernelType GetKernelTypeForVar( - const 
std::string& var_name, const Tensor& tensor, + const std::string& var_name, const framework::Tensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { if (var_name == "X") { return expected_kernel_type; From 7e1da1e37183c2c80d69f482135f7cde560559d2 Mon Sep 17 00:00:00 2001 From: LielinJiang Date: Fri, 12 Nov 2021 03:01:10 +0000 Subject: [PATCH 14/95] output tensor --- paddle/fluid/operators/decode_op.cu | 8 ++++---- paddle/fluid/operators/file_label_reader_op.cc | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/operators/decode_op.cu b/paddle/fluid/operators/decode_op.cu index bc0a464254a7b0..07ba2ff3037e0d 100644 --- a/paddle/fluid/operators/decode_op.cu +++ b/paddle/fluid/operators/decode_op.cu @@ -64,10 +64,10 @@ class GPUBatchDecodeJpegKernel : public framework::OpKernel { auto* out = ctx.OutputVar("Out"); auto& out_array = *out->GetMutable(); out_array.resize(ins->size()); - std::cout << "decode: " << ins->size() << std::endl; + // std::cout << "decode: " << ins->size() << std::endl; for (int i = 0; i < ins->size(); i++) { const framework::LoDTensor x = ins->at(i); - framework::LoDTensor out = out_array.at(i); + // framework::LoDTensor out = out_array.at(i); int components; nvjpegChromaSubsampling_t subsampling; int widths[NVJPEG_MAX_COMPONENT]; @@ -126,9 +126,9 @@ class GPUBatchDecodeJpegKernel : public framework::OpKernel { // auto* out = ctx.Output("Out"); std::vector out_shape = {output_components, height, width}; - out.Resize(framework::make_ddim(out_shape)); + out_array.at(i).Resize(framework::make_ddim(out_shape)); - T* data = out.mutable_data(ctx.GetPlace()); + T* data = out_array.at(i).mutable_data(ctx.GetPlace()); for (int c = 0; c < output_components; c++) { out_image.channel[c] = data + c * sz; diff --git a/paddle/fluid/operators/file_label_reader_op.cc b/paddle/fluid/operators/file_label_reader_op.cc index 0094d2b34c3813..286557012c2e72 100644 --- 
a/paddle/fluid/operators/file_label_reader_op.cc +++ b/paddle/fluid/operators/file_label_reader_op.cc @@ -95,8 +95,8 @@ class FileDataReader { std::vector labels = ctx.Attr>("labels"); rank_ = ctx.Attr("rank"); world_size_ = ctx.Attr("world_size"); - std::cout << "files and labels size: " << files.size() << " " - << labels.size() << std::endl; + // std::cout << "files and labels size: " << files.size() << " " + // << labels.size() << std::endl; batch_size_ = ctx.Attr("batch_size"); current_epoch_ = 0; current_iter_ = 0; @@ -157,7 +157,7 @@ class FileDataReader { } bool LoadBatch() { - std::cout << "start LoadBatch 0.01" << std::endl; + // std::cout << "start LoadBatch 0.01" << std::endl; LoDTensorArray batch_data = std::move(Read()); return batch_buffer_.Push(batch_data) == BufferStatus::kBufferStatusSuccess; } From ef9b44fd50e3e414c33438c654e6845472842ef2 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Mon, 15 Nov 2021 07:37:33 +0000 Subject: [PATCH 15/95] add debug log --- paddle/fluid/operators/data/dataloader_op.h | 3 ++ paddle/fluid/operators/data/pipeline.cc | 2 ++ paddle/fluid/operators/decode_op.cu | 2 ++ .../fluid/operators/file_label_reader_op.cc | 4 +++ .../operators/random_crop_and_resize_op.cc | 14 ++++----- .../operators/random_crop_and_resize_op.cu | 29 ++++++++++++------- python/paddle/fluid/dataloader/pipeline.py | 6 ++++ 7 files changed, 43 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/operators/data/dataloader_op.h b/paddle/fluid/operators/data/dataloader_op.h index d2d52d9150969a..4e5c1b2541d07a 100644 --- a/paddle/fluid/operators/data/dataloader_op.h +++ b/paddle/fluid/operators/data/dataloader_op.h @@ -22,6 +22,7 @@ template class DataLoaderOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + LOG(ERROR) << "DataLoaderOpKernel enter"; // Step1: get output vars and attrs auto output_vars = ctx.MultiOutputVar("Out"); auto output_var_names = ctx.OutputNames("Out"); @@ 
-37,7 +38,9 @@ class DataLoaderOpKernel : public framework::OpKernel { program_id, global_block, ctx.GetPlace(), start_op_index, end_op_index, output_var_names, prefetch_depth); + LOG(ERROR) << "Get Pipeline finsih"; pipeline->ReadNext(output_vars); + LOG(ERROR) << "ReadNext finish"; } }; diff --git a/paddle/fluid/operators/data/pipeline.cc b/paddle/fluid/operators/data/pipeline.cc index 69a81133395518..bc67fc3e01c3e7 100644 --- a/paddle/fluid/operators/data/pipeline.cc +++ b/paddle/fluid/operators/data/pipeline.cc @@ -68,6 +68,7 @@ void Pipeline::StartPrefetchThread(std::shared_ptr executor, const std::vector &skip_vars) { thread_pool_.enqueue([this, executor, skip_vars]() -> void { while (!closed_.load()) { + LOG(ERROR) << "Executor run a iter start"; // Step1: run ops by executor without fetch executor->RunWithoutFetch(skip_vars); @@ -92,6 +93,7 @@ void Pipeline::StartPrefetchThread(std::shared_ptr executor, // Step3: put LoDTensorArray to prefetch blocking_queue prefetch_queue_.Push(t_arr); + LOG(ERROR) << "Executor run a iter finish"; } }); } diff --git a/paddle/fluid/operators/decode_op.cu b/paddle/fluid/operators/decode_op.cu index 07ba2ff3037e0d..fc1a55a31e49f2 100644 --- a/paddle/fluid/operators/decode_op.cu +++ b/paddle/fluid/operators/decode_op.cu @@ -37,6 +37,7 @@ template class GPUBatchDecodeJpegKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + LOG(ERROR) << "GPUBatchDecodeJpegKernel Compute start"; // Create nvJPEG handle if (batch_nvjpeg_handle == nullptr) { nvjpegStatus_t create_status = @@ -139,6 +140,7 @@ class GPUBatchDecodeJpegKernel : public framework::OpKernel { batch_nvjpeg_handle, nvjpeg_state, x_data, x.numel(), output_format, &out_image, batch_nvjpeg_stream); } + LOG(ERROR) << "GPUBatchDecodeJpegKernel Compute finish"; } }; diff --git a/paddle/fluid/operators/file_label_reader_op.cc b/paddle/fluid/operators/file_label_reader_op.cc index 286557012c2e72..31a08affcb44cb 
100644 --- a/paddle/fluid/operators/file_label_reader_op.cc +++ b/paddle/fluid/operators/file_label_reader_op.cc @@ -144,6 +144,8 @@ class FileDataReader { ret.reserve(batch_size_); int start_index = GetStartIndex(); for (int32_t i = start_index; i < start_index + batch_size_; ++i) { + // FIXME + i %= image_label_pairs_.size(); framework::LoDTensor tmp = ReadSample(image_label_pairs_[i].first); ret.push_back(std::move(tmp)); } @@ -217,6 +219,7 @@ class FileLabelReaderOp : public framework::OperatorBase { private: void RunImpl(const framework::Scope& scope, const platform::Place& dev_place) const override { + LOG(ERROR) << "FileLabelReaderOp RunImpl start"; platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& dev_ctx = *pool.Get(dev_place); framework::RuntimeContext run_ctx(Inputs(), Outputs(), scope); @@ -232,6 +235,7 @@ class FileLabelReaderOp : public framework::OperatorBase { for (size_t i = 0; i < samples.size(); ++i) { copy_tensor(samples[i], &out_array[i]); } + LOG(ERROR) << "FileLabelReaderOp RunImpl finish"; } void copy_tensor(const framework::LoDTensor& lod_tensor, diff --git a/paddle/fluid/operators/random_crop_and_resize_op.cc b/paddle/fluid/operators/random_crop_and_resize_op.cc index 4bb12506c1f580..1bd8d481808b2f 100644 --- a/paddle/fluid/operators/random_crop_and_resize_op.cc +++ b/paddle/fluid/operators/random_crop_and_resize_op.cc @@ -27,7 +27,7 @@ class RandomCropAndResizeOp : public framework::OperatorWithKernel { OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "RandomCropAndResize"); - auto size = ctx->Attrs().Get>("size"); + auto size = ctx->Attrs().Get>("size"); PADDLE_ENFORCE_EQ(size.size(), 2, platform::errors::InvalidArgument( "The length of Attrs(size) should be 2.")); @@ -39,11 +39,11 @@ class RandomCropAndResizeOp : public framework::OperatorWithKernel { platform::errors::InvalidArgument( "w in Attr(size) of Op(RandomCropAndResize) " "should be greater than 0.")); - auto x_dim = 
ctx->GetInputsDim("X"); // NCHW format - - std::vector out_dim = {static_cast(x_dim.size()), - x_dim[0][0], size[0], size[1]}; - ctx->SetOutputDim("Out", framework::make_ddim({out_dim})); + // auto x_dim = ctx->GetInputsDim("X"); // NCHW format + // + // std::vector out_dim = {static_cast(x_dim.size()), + // x_dim[0][0], size[0], size[1]}; + // ctx->SetOutputDim("Out", framework::make_ddim({out_dim})); } framework::OpKernelType GetExpectedKernelType( @@ -68,7 +68,7 @@ class RandomCropAndResizeOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddInput("X", "(LoDTensorArray). A batch of instances to random crop."); AddOutput("Out", "(Tensor). The cropped instance batch."); - AddAttr>( + AddAttr>( "size", "expected output size of the crop, for each edge."); AddAttr>( "scale", diff --git a/paddle/fluid/operators/random_crop_and_resize_op.cu b/paddle/fluid/operators/random_crop_and_resize_op.cu index 3381d9f2d2703c..1aaf347af7b9ed 100644 --- a/paddle/fluid/operators/random_crop_and_resize_op.cu +++ b/paddle/fluid/operators/random_crop_and_resize_op.cu @@ -19,7 +19,7 @@ namespace paddle { namespace operators { -using framework::Tensor; +using framework::LoDTensor; using DataLayout = framework::DataLayout; template @@ -162,14 +162,14 @@ __global__ void KeBilinearInterpFw( template static void RandomCropAndResizeFwd( const framework::ExecutionContext& ctx, const framework::LoDTensor& input, - framework::Tensor* output, const std::vector out_size, + framework::Tensor* output, const std::vector out_size, const std::string interp_method, const bool align_corners, const int align_mode, const int img_h, const int img_w, const int c, const int idx_h, const int idx_w, const int crop_h, const int crop_w, const DataLayout data_layout) { auto input_data = input.template data(); - int out_h = out_size[0]; - int out_w = out_size[1]; + int out_h = static_cast(out_size[0]); + int out_w = static_cast(out_size[1]); framework::DDim dim_out; if (data_layout == 
DataLayout::kNCHW) { @@ -177,7 +177,8 @@ static void RandomCropAndResizeFwd( } else { dim_out = {out_h, out_w, c}; } - auto output_data = output->template mutable_data(ctx.GetPlace()); + // auto output_data = output->template mutable_data(ctx.GetPlace()); + auto output_data = output->data(); if (img_h == crop_h && img_w == crop_w) { framework::TensorCopy(input, ctx.GetPlace(), output); @@ -271,6 +272,7 @@ template class RandomCropAndResizeCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + LOG(ERROR) << "RandomCropAndResizeCUDAKernel Compute start"; PADDLE_ENFORCE_EQ( platform::is_gpu_place(ctx.GetPlace()), true, platform::errors::NotFound("This kernel only runs on GPU device.")); @@ -280,9 +282,9 @@ class RandomCropAndResizeCUDAKernel : public framework::OpKernel { PADDLE_ENFORCE_GT(x->size(), 0, platform::errors::InvalidArgument( "The size of X must be greater than 0.")); - auto* out = ctx.Output("Out"); + auto* out = ctx.Output("Out"); // get size, scale, ratio - auto size = ctx.Attr>("size"); + auto size = ctx.Attr>("size"); auto scale = ctx.Attr>("scale"); auto ratio = ctx.Attr>("ratio"); // get random seed @@ -297,15 +299,21 @@ class RandomCropAndResizeCUDAKernel : public framework::OpKernel { int align_mode = ctx.Attr("align_mode"); auto* img = &x->at(0); - int img_h, img_w, img_c, idx_h, idx_w, crop_h, crop_w; + int64_t img_c = data_layout == DataLayout::kNCHW ? \ + img->dims()[0] : img->dims()[2]; + + std::vector out_dim = {static_cast(x->size()), + img_c, size[0], size[1]}; + out->Resize(framework::make_ddim(out_dim)); + out->mutable_data(ctx.GetPlace()); + + int img_h, img_w, idx_h, idx_w, crop_h, crop_w; for (int i = 0; i < x->size(); i++) { img = &x->at(i); img_h = data_layout == DataLayout::kNCHW ? img->dims()[1] : img->dims()[0]; img_w = data_layout == DataLayout::kNCHW ? img->dims()[2] : img->dims()[1]; - img_c = - data_layout == DataLayout::kNCHW ? 
img->dims()[0] : img->dims()[2]; GetCropParameters(img_h, img_w, scale, ratio, &idx_h, &idx_w, &crop_h, &crop_w, seed); @@ -314,6 +322,7 @@ class RandomCropAndResizeCUDAKernel : public framework::OpKernel { align_corners, align_mode, img_h, img_w, img_c, idx_h, idx_w, crop_h, crop_w, data_layout); } + LOG(ERROR) << "RandomCropAndResizeCUDAKernel Compute finish"; } }; diff --git a/python/paddle/fluid/dataloader/pipeline.py b/python/paddle/fluid/dataloader/pipeline.py index 08136ee3000485..9d291ee4ac7b6b 100755 --- a/python/paddle/fluid/dataloader/pipeline.py +++ b/python/paddle/fluid/dataloader/pipeline.py @@ -104,7 +104,13 @@ def __next__(self): self._output_vars = self._prepare_output_vars() # try: + import sys + import time + tic = time.time() _C_ops.dataloader(self._output_vars, *self._attrs) + toc = time.time() + print("_C_ops calling cost {}ms".format(toc - tic)) + sys.stdout.flush() # except KeyboardInterrupt: # pass From 1d94fb99b044213edf042103e5e99257579a25bb Mon Sep 17 00:00:00 2001 From: LielinJiang Date: Tue, 16 Nov 2021 03:31:14 +0000 Subject: [PATCH 16/95] add threads pool --- paddle/fluid/operators/decode_op.cc | 1 + paddle/fluid/operators/decode_op.cu | 158 ++++++++++-------- .../fluid/operators/file_label_reader_op.cc | 8 +- 3 files changed, 98 insertions(+), 69 deletions(-) diff --git a/paddle/fluid/operators/decode_op.cc b/paddle/fluid/operators/decode_op.cc index e2ca1d20357271..0c4dee14399e46 100644 --- a/paddle/fluid/operators/decode_op.cc +++ b/paddle/fluid/operators/decode_op.cc @@ -99,6 +99,7 @@ or 1 dimensional Gray Tensor. Optionally converts the image to the desired format. The values of the output tensor are uint8 between 0 and 255. 
)DOC"); + AddAttr("num_threads", "Path of the file to be readed.").SetDefault(2); AddAttr( "mode", "(string, default \"unchanged\"), The read mode used " diff --git a/paddle/fluid/operators/decode_op.cu b/paddle/fluid/operators/decode_op.cu index fc1a55a31e49f2..b83322ea5797e4 100644 --- a/paddle/fluid/operators/decode_op.cu +++ b/paddle/fluid/operators/decode_op.cu @@ -14,6 +14,7 @@ #if !defined(WITH_NV_JETSON) && !defined(PADDLE_WITH_HIP) +#include #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/dynload/nvjpeg.h" @@ -23,8 +24,9 @@ namespace paddle { namespace operators { -static cudaStream_t batch_nvjpeg_stream = nullptr; +static std::vector nvjpeg_streams; static nvjpegHandle_t batch_nvjpeg_handle = nullptr; +static std::unique_ptr<::ThreadPool> pool_; void batch_InitNvjpegImage(nvjpegImage_t* img) { for (int c = 0; c < NVJPEG_MAX_COMPONENT; c++) { @@ -38,6 +40,8 @@ class GPUBatchDecodeJpegKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { LOG(ERROR) << "GPUBatchDecodeJpegKernel Compute start"; + int num_threads_ = ctx.Attr("num_threads"); + auto mode = ctx.Attr("mode"); // Create nvJPEG handle if (batch_nvjpeg_handle == nullptr) { nvjpegStatus_t create_status = @@ -46,56 +50,77 @@ class GPUBatchDecodeJpegKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(create_status, NVJPEG_STATUS_SUCCESS, platform::errors::Fatal("nvjpegCreateSimple failed: ", create_status)); - } - nvjpegJpegState_t nvjpeg_state; - nvjpegStatus_t state_status = platform::dynload::nvjpegJpegStateCreate( - batch_nvjpeg_handle, &nvjpeg_state); + nvjpeg_streams.reserve(num_threads_); - PADDLE_ENFORCE_EQ(state_status, NVJPEG_STATUS_SUCCESS, - platform::errors::Fatal("nvjpegJpegStateCreate failed: ", - state_status)); + for (int i = 0; i < num_threads_; i++) { + cudaStreamCreateWithFlags(&nvjpeg_streams[i], cudaStreamNonBlocking); + } + } + + pool_.reset(new ::ThreadPool(num_threads_)); 
const framework::LoDTensorArray* ins = ctx.Input("X"); - // auto* outs = ctx.Output("Out"); - // auto *out = scope.FindVar(Output("Out")); - // std::cout << "debug 0.05" << std::endl; - // auto &out_array = outs->GetMutable(); + auto* out = ctx.OutputVar("Out"); auto& out_array = *out->GetMutable(); out_array.resize(ins->size()); - // std::cout << "decode: " << ins->size() << std::endl; - for (int i = 0; i < ins->size(); i++) { - const framework::LoDTensor x = ins->at(i); - // framework::LoDTensor out = out_array.at(i); - int components; - nvjpegChromaSubsampling_t subsampling; - int widths[NVJPEG_MAX_COMPONENT]; - int heights[NVJPEG_MAX_COMPONENT]; - - auto* x_data = x.data(); - - nvjpegStatus_t info_status = platform::dynload::nvjpegGetImageInfo( - batch_nvjpeg_handle, x_data, (size_t)x.numel(), &components, - &subsampling, widths, heights); - - PADDLE_ENFORCE_EQ( - info_status, NVJPEG_STATUS_SUCCESS, - platform::errors::Fatal("nvjpegGetImageInfo failed: ", info_status)); - int width = widths[0]; - int height = heights[0]; + std::vector> tasks(ins->size()); - nvjpegOutputFormat_t output_format; - int output_components; - - auto mode = ctx.Attr("mode"); - if (mode == "unchanged") { - if (components == 1) { + auto dev = ctx.GetPlace(); + for (int i = 0; i < ins->size(); i++) { + auto nvjpeg_stream = nvjpeg_streams[i % num_threads_]; + auto nvjpeg_handle = batch_nvjpeg_handle; + tasks[i] = pool_->enqueue([this, i, ins, &out_array, mode, nvjpeg_handle, + nvjpeg_stream, dev]() -> int { + nvjpegJpegState_t nvjpeg_state; + nvjpegStatus_t state_status = platform::dynload::nvjpegJpegStateCreate( + batch_nvjpeg_handle, &nvjpeg_state); + + PADDLE_ENFORCE_EQ(state_status, NVJPEG_STATUS_SUCCESS, + platform::errors::Fatal( + "nvjpegJpegStateCreate failed: ", state_status)); + const framework::LoDTensor x = ins->at(i); + // framework::LoDTensor out = out_array.at(i); + int components; + nvjpegChromaSubsampling_t subsampling; + int widths[NVJPEG_MAX_COMPONENT]; + int 
heights[NVJPEG_MAX_COMPONENT]; + + auto* x_data = x.data(); + + nvjpegStatus_t info_status = platform::dynload::nvjpegGetImageInfo( + batch_nvjpeg_handle, x_data, (size_t)x.numel(), &components, + &subsampling, widths, heights); + + PADDLE_ENFORCE_EQ(info_status, NVJPEG_STATUS_SUCCESS, + platform::errors::Fatal("nvjpegGetImageInfo failed: ", + info_status)); + + int width = widths[0]; + int height = heights[0]; + + nvjpegOutputFormat_t output_format; + int output_components; + + if (mode == "unchanged") { + if (components == 1) { + output_format = NVJPEG_OUTPUT_Y; + output_components = 1; + } else if (components == 3) { + output_format = NVJPEG_OUTPUT_RGB; + output_components = 3; + } else { + platform::dynload::nvjpegJpegStateDestroy(nvjpeg_state); + PADDLE_THROW(platform::errors::Fatal( + "The provided mode is not supported for JPEG files on GPU")); + } + } else if (mode == "gray") { output_format = NVJPEG_OUTPUT_Y; output_components = 1; - } else if (components == 3) { + } else if (mode == "rgb") { output_format = NVJPEG_OUTPUT_RGB; output_components = 3; } else { @@ -103,42 +128,39 @@ class GPUBatchDecodeJpegKernel : public framework::OpKernel { PADDLE_THROW(platform::errors::Fatal( "The provided mode is not supported for JPEG files on GPU")); } - } else if (mode == "gray") { - output_format = NVJPEG_OUTPUT_Y; - output_components = 1; - } else if (mode == "rgb") { - output_format = NVJPEG_OUTPUT_RGB; - output_components = 3; - } else { - platform::dynload::nvjpegJpegStateDestroy(nvjpeg_state); - PADDLE_THROW(platform::errors::Fatal( - "The provided mode is not supported for JPEG files on GPU")); - } - nvjpegImage_t out_image; - batch_InitNvjpegImage(&out_image); + nvjpegImage_t out_image; + batch_InitNvjpegImage(&out_image); - // create nvjpeg stream - if (batch_nvjpeg_stream == nullptr) { - cudaStreamCreateWithFlags(&batch_nvjpeg_stream, cudaStreamNonBlocking); - } + // create nvjpeg stream + // if (batch_nvjpeg_stream == nullptr) { + // 
cudaStreamCreateWithFlags(&batch_nvjpeg_stream, + // cudaStreamNonBlocking); + // } - int sz = widths[0] * heights[0]; + int sz = widths[0] * heights[0]; - // auto* out = ctx.Output("Out"); - std::vector out_shape = {output_components, height, width}; - out_array.at(i).Resize(framework::make_ddim(out_shape)); + // auto* out = ctx.Output("Out"); + std::vector out_shape = {output_components, height, width}; + out_array.at(i).Resize(framework::make_ddim(out_shape)); - T* data = out_array.at(i).mutable_data(ctx.GetPlace()); + uint8_t* data = out_array.at(i).mutable_data(dev); - for (int c = 0; c < output_components; c++) { - out_image.channel[c] = data + c * sz; - out_image.pitch[c] = width; - } + for (int c = 0; c < output_components; c++) { + out_image.channel[c] = data + c * sz; + out_image.pitch[c] = width; + } + + nvjpegStatus_t decode_status = platform::dynload::nvjpegDecode( + batch_nvjpeg_handle, nvjpeg_state, x_data, x.numel(), output_format, + &out_image, nvjpeg_stream); + // std:: cout << "task read ok: " << i << std:: endl; + return 0; + }); + } - nvjpegStatus_t decode_status = platform::dynload::nvjpegDecode( - batch_nvjpeg_handle, nvjpeg_state, x_data, x.numel(), output_format, - &out_image, batch_nvjpeg_stream); + for (size_t i = 0; i < tasks.size(); ++i) { + tasks[i].wait(); } LOG(ERROR) << "GPUBatchDecodeJpegKernel Compute finish"; } diff --git a/paddle/fluid/operators/file_label_reader_op.cc b/paddle/fluid/operators/file_label_reader_op.cc index 31a08affcb44cb..9d90717d0624e5 100644 --- a/paddle/fluid/operators/file_label_reader_op.cc +++ b/paddle/fluid/operators/file_label_reader_op.cc @@ -100,6 +100,7 @@ class FileDataReader { batch_size_ = ctx.Attr("batch_size"); current_epoch_ = 0; current_iter_ = 0; + iters_per_epoch_ = labels.size() / (batch_size_ * world_size_); is_closed_ = false; for (int i = 0, n = files.size(); i < n; i++) image_label_pairs_.emplace_back(std::move(files[i]), labels[i]); @@ -107,7 +108,11 @@ class FileDataReader { } int 
GetStartIndex() { - return batch_size_ * world_size_ * current_iter_ + rank_ * batch_size_; + int start_idx = + batch_size_ * world_size_ * (current_iter_ % iters_per_epoch_) + + rank_ * batch_size_; + current_iter_++; + return start_idx; } framework::LoDTensor ReadSample(const std::string filename) { @@ -172,6 +177,7 @@ class FileDataReader { int current_iter_; int rank_; int world_size_; + int iters_per_epoch_; std::atomic is_closed_; Buffer batch_buffer_; std::thread load_thrd_; From 5c1316a7a4a7a53d869abb6dd1b00e4f7af0dd37 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Sun, 21 Nov 2021 11:40:43 +0000 Subject: [PATCH 17/95] multi-phrase nvjpeg decode single thread success --- paddle/fluid/operators/decode_op.cu | 245 +++++++++++---------- paddle/fluid/operators/nvjpeg_decoder.h | 228 +++++++++++++++++++ paddle/fluid/platform/dynload/nvjpeg.h | 30 ++- paddle/fluid/platform/enforce.h | 39 ++++ paddle/fluid/platform/external_error.proto | 3 +- python/paddle/fluid/dataloader/pipeline.py | 2 +- python/paddle/vision/ops.py | 4 +- 7 files changed, 431 insertions(+), 120 deletions(-) create mode 100644 paddle/fluid/operators/nvjpeg_decoder.h diff --git a/paddle/fluid/operators/decode_op.cu b/paddle/fluid/operators/decode_op.cu index b83322ea5797e4..72597a566ae380 100644 --- a/paddle/fluid/operators/decode_op.cu +++ b/paddle/fluid/operators/decode_op.cu @@ -20,6 +20,7 @@ #include "paddle/fluid/platform/dynload/nvjpeg.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/stream/cuda_stream.h" +#include "paddle/fluid/operators/nvjpeg_decoder.h" namespace paddle { namespace operators { @@ -28,6 +29,8 @@ static std::vector nvjpeg_streams; static nvjpegHandle_t batch_nvjpeg_handle = nullptr; static std::unique_ptr<::ThreadPool> pool_; +static NvjpegDecoder* nvjpeg_decoder = nullptr; + void batch_InitNvjpegImage(nvjpegImage_t* img) { for (int c = 0; c < NVJPEG_MAX_COMPONENT; c++) { img->channel[c] = nullptr; @@ -42,126 +45,146 @@ class 
GPUBatchDecodeJpegKernel : public framework::OpKernel { LOG(ERROR) << "GPUBatchDecodeJpegKernel Compute start"; int num_threads_ = ctx.Attr("num_threads"); auto mode = ctx.Attr("mode"); - // Create nvJPEG handle - if (batch_nvjpeg_handle == nullptr) { - nvjpegStatus_t create_status = - platform::dynload::nvjpegCreateSimple(&batch_nvjpeg_handle); - - PADDLE_ENFORCE_EQ(create_status, NVJPEG_STATUS_SUCCESS, - platform::errors::Fatal("nvjpegCreateSimple failed: ", - create_status)); - - nvjpeg_streams.reserve(num_threads_); - - for (int i = 0; i < num_threads_; i++) { - cudaStreamCreateWithFlags(&nvjpeg_streams[i], cudaStreamNonBlocking); - } + + // multi-phrase decode + if (!nvjpeg_decoder) { + nvjpeg_decoder = new NvjpegDecoder(mode); } - pool_.reset(new ::ThreadPool(num_threads_)); - - const framework::LoDTensorArray* ins = + const framework::LoDTensorArray* inputs = ctx.Input("X"); auto* out = ctx.OutputVar("Out"); auto& out_array = *out->GetMutable(); - out_array.resize(ins->size()); - - std::vector> tasks(ins->size()); - - auto dev = ctx.GetPlace(); - for (int i = 0; i < ins->size(); i++) { - auto nvjpeg_stream = nvjpeg_streams[i % num_threads_]; - auto nvjpeg_handle = batch_nvjpeg_handle; - tasks[i] = pool_->enqueue([this, i, ins, &out_array, mode, nvjpeg_handle, - nvjpeg_stream, dev]() -> int { - nvjpegJpegState_t nvjpeg_state; - nvjpegStatus_t state_status = platform::dynload::nvjpegJpegStateCreate( - batch_nvjpeg_handle, &nvjpeg_state); - - PADDLE_ENFORCE_EQ(state_status, NVJPEG_STATUS_SUCCESS, - platform::errors::Fatal( - "nvjpegJpegStateCreate failed: ", state_status)); - const framework::LoDTensor x = ins->at(i); - // framework::LoDTensor out = out_array.at(i); - int components; - nvjpegChromaSubsampling_t subsampling; - int widths[NVJPEG_MAX_COMPONENT]; - int heights[NVJPEG_MAX_COMPONENT]; - - auto* x_data = x.data(); - - nvjpegStatus_t info_status = platform::dynload::nvjpegGetImageInfo( - batch_nvjpeg_handle, x_data, (size_t)x.numel(), &components, - 
&subsampling, widths, heights); - - PADDLE_ENFORCE_EQ(info_status, NVJPEG_STATUS_SUCCESS, - platform::errors::Fatal("nvjpegGetImageInfo failed: ", - info_status)); + out_array.resize(inputs->size()); - int width = widths[0]; - int height = heights[0]; - - nvjpegOutputFormat_t output_format; - int output_components; - - if (mode == "unchanged") { - if (components == 1) { - output_format = NVJPEG_OUTPUT_Y; - output_components = 1; - } else if (components == 3) { - output_format = NVJPEG_OUTPUT_RGB; - output_components = 3; - } else { - platform::dynload::nvjpegJpegStateDestroy(nvjpeg_state); - PADDLE_THROW(platform::errors::Fatal( - "The provided mode is not supported for JPEG files on GPU")); - } - } else if (mode == "gray") { - output_format = NVJPEG_OUTPUT_Y; - output_components = 1; - } else if (mode == "rgb") { - output_format = NVJPEG_OUTPUT_RGB; - output_components = 3; - } else { - platform::dynload::nvjpegJpegStateDestroy(nvjpeg_state); - PADDLE_THROW(platform::errors::Fatal( - "The provided mode is not supported for JPEG files on GPU")); - } - - nvjpegImage_t out_image; - batch_InitNvjpegImage(&out_image); - - // create nvjpeg stream - // if (batch_nvjpeg_stream == nullptr) { - // cudaStreamCreateWithFlags(&batch_nvjpeg_stream, - // cudaStreamNonBlocking); - // } - - int sz = widths[0] * heights[0]; - - // auto* out = ctx.Output("Out"); - std::vector out_shape = {output_components, height, width}; - out_array.at(i).Resize(framework::make_ddim(out_shape)); - - uint8_t* data = out_array.at(i).mutable_data(dev); - - for (int c = 0; c < output_components; c++) { - out_image.channel[c] = data + c * sz; - out_image.pitch[c] = width; - } - - nvjpegStatus_t decode_status = platform::dynload::nvjpegDecode( - batch_nvjpeg_handle, nvjpeg_state, x_data, x.numel(), output_format, - &out_image, nvjpeg_stream); - // std:: cout << "task read ok: " << i << std:: endl; - return 0; - }); - } + for (size_t i = 0; i < inputs->size(); i++) { + const framework::LoDTensor x = 
inputs->at(i); + auto* x_data = x.data(); - for (size_t i = 0; i < tasks.size(); ++i) { - tasks[i].wait(); + nvjpeg_decoder->Run(x_data, static_cast(x.numel()), + &out_array[i], ctx); } + // // Create nvJPEG handle + // if (batch_nvjpeg_handle == nullptr) { + // nvjpegStatus_t create_status = + // platform::dynload::nvjpegCreateSimple(&batch_nvjpeg_handle); + // + // PADDLE_ENFORCE_EQ(create_status, NVJPEG_STATUS_SUCCESS, + // platform::errors::Fatal("nvjpegCreateSimple failed: ", + // create_status)); + // + // nvjpeg_streams.reserve(num_threads_); + // + // for (int i = 0; i < num_threads_; i++) { + // cudaStreamCreateWithFlags(&nvjpeg_streams[i], cudaStreamNonBlocking); + // } + // pool_.reset(new ::ThreadPool(num_threads_)); + // } + // + // const framework::LoDTensorArray* ins = + // ctx.Input("X"); + // + // auto* out = ctx.OutputVar("Out"); + // auto& out_array = *out->GetMutable(); + // out_array.resize(ins->size()); + // + // std::vector> tasks(ins->size()); + // + // auto dev = ctx.GetPlace(); + // for (int i = 0; i < ins->size(); i++) { + // auto nvjpeg_stream = nvjpeg_streams[i % num_threads_]; + // auto nvjpeg_handle = batch_nvjpeg_handle; + // tasks[i] = pool_->enqueue([this, i, ins, &out_array, mode, nvjpeg_handle, + // nvjpeg_stream, dev]() -> int { + // nvjpegJpegState_t nvjpeg_state; + // nvjpegStatus_t state_status = platform::dynload::nvjpegJpegStateCreate( + // batch_nvjpeg_handle, &nvjpeg_state); + // + // PADDLE_ENFORCE_EQ(state_status, NVJPEG_STATUS_SUCCESS, + // platform::errors::Fatal( + // "nvjpegJpegStateCreate failed: ", state_status)); + // const framework::LoDTensor x = ins->at(i); + // // framework::LoDTensor out = out_array.at(i); + // int components; + // nvjpegChromaSubsampling_t subsampling; + // int widths[NVJPEG_MAX_COMPONENT]; + // int heights[NVJPEG_MAX_COMPONENT]; + // + // auto* x_data = x.data(); + // + // nvjpegStatus_t info_status = platform::dynload::nvjpegGetImageInfo( + // batch_nvjpeg_handle, x_data, 
(size_t)x.numel(), &components, + // &subsampling, widths, heights); + // + // PADDLE_ENFORCE_EQ(info_status, NVJPEG_STATUS_SUCCESS, + // platform::errors::Fatal("nvjpegGetImageInfo failed: ", + // info_status)); + // + // int width = widths[0]; + // int height = heights[0]; + // + // nvjpegOutputFormat_t output_format; + // int output_components; + // + // if (mode == "unchanged") { + // if (components == 1) { + // output_format = NVJPEG_OUTPUT_Y; + // output_components = 1; + // } else if (components == 3) { + // output_format = NVJPEG_OUTPUT_RGB; + // output_components = 3; + // } else { + // platform::dynload::nvjpegJpegStateDestroy(nvjpeg_state); + // PADDLE_THROW(platform::errors::Fatal( + // "The provided mode is not supported for JPEG files on GPU")); + // } + // } else if (mode == "gray") { + // output_format = NVJPEG_OUTPUT_Y; + // output_components = 1; + // } else if (mode == "rgb") { + // output_format = NVJPEG_OUTPUT_RGB; + // output_components = 3; + // } else { + // platform::dynload::nvjpegJpegStateDestroy(nvjpeg_state); + // PADDLE_THROW(platform::errors::Fatal( + // "The provided mode is not supported for JPEG files on GPU")); + // } + // + // nvjpegImage_t out_image; + // batch_InitNvjpegImage(&out_image); + // + // // create nvjpeg stream + // // if (batch_nvjpeg_stream == nullptr) { + // // cudaStreamCreateWithFlags(&batch_nvjpeg_stream, + // // cudaStreamNonBlocking); + // // } + // + // int sz = widths[0] * heights[0]; + // + // // auto* out = ctx.Output("Out"); + // std::vector out_shape = {output_components, height, width}; + // out_array.at(i).Resize(framework::make_ddim(out_shape)); + // + // uint8_t* data = out_array.at(i).mutable_data(dev); + // // transfer and decode to device buffer + // + // for (int c = 0; c < output_components; c++) { + // out_image.channel[c] = data + c * sz; + // out_image.pitch[c] = width; + // } + // + // nvjpegStatus_t decode_status = platform::dynload::nvjpegDecode( + // batch_nvjpeg_handle, nvjpeg_state, 
x_data, x.numel(), output_format, + // &out_image, nvjpeg_stream); + // // std:: cout << "task read ok: " << i << std:: endl; + // return 0; + // }); + // } + // + // for (size_t i = 0; i < tasks.size(); ++i) { + // tasks[i].wait(); + // } LOG(ERROR) << "GPUBatchDecodeJpegKernel Compute finish"; } }; diff --git a/paddle/fluid/operators/nvjpeg_decoder.h b/paddle/fluid/operators/nvjpeg_decoder.h new file mode 100644 index 00000000000000..356d056645ef51 --- /dev/null +++ b/paddle/fluid/operators/nvjpeg_decoder.h @@ -0,0 +1,228 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/dynload/nvjpeg.h" +#include "paddle/fluid/platform/stream/cuda_stream.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; + +static int dev_malloc(void **p, size_t s) { return (int)cudaMalloc(p, s); } +static int dev_free(void *p) { return (int)cudaFree(p); } + +static int host_malloc(void** p, size_t s, unsigned int f) { + return (int)cudaHostAlloc(p, s, f); +} + +static int host_free(void* p) { return (int)cudaFreeHost(p); } + +class NvjpegDecoder { + public: + NvjpegDecoder(std::string mode) + : nvjpeg_streams_(2), + page_id_(0), + pinned_buffers_(2), + mode_(mode) { + // create cuda stream + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreateWithFlags(&cuda_stream_, cudaStreamNonBlocking)); + + // create nvjpeg handle and stream + // device_allocator_.dev_malloc = &cudaMalloc; + // device_allocator_.dev_free = &cudaFree; + // pinned_allocator_.pinned_malloc = &cudaMallocHost; + // pinned_allocator_.pinned_free = &cudaFreeHost; + PADDLE_ENFORCE_NVJPEG_SUCCESS( + platform::dynload::nvjpegCreateEx(NVJPEG_BACKEND_DEFAULT, &device_allocator_, + &pinned_allocator_, 0, &handle_)); + for (size_t i; i < nvjpeg_streams_.size(); i++) { + PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegJpegStreamCreate(handle_, &nvjpeg_streams_[i])); + } + + // create decode params, decoder and state + PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecodeParamsCreate(handle_, &decode_params_)); + PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecoderCreate(handle_, NVJPEG_BACKEND_DEFAULT, &decoder_)); + PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecoderStateCreate(handle_, decoder_, &state_)); + + // create device & pinned buffer + PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegBufferDeviceCreate(handle_, &device_allocator_, &device_buffer_)); + for (size_t i = 0; i < pinned_buffers_.size(); i++) { + 
PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegBufferPinnedCreate(handle_, &pinned_allocator_, &pinned_buffers_[i])); + } + } + + ~NvjpegDecoder() { + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(cuda_stream_)); + + // destroy nvjpeg streams + for (size_t i = 0; i < nvjpeg_streams_.size(); i++) { + PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegJpegStreamDestroy(nvjpeg_streams_[i])); + } + + // destroy decode params, decoder and state + PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecodeParamsDestroy(decode_params_)); + PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecoderDestroy(decoder_)); + PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegJpegStateDestroy(state_)); + + PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegBufferDeviceDestroy(device_buffer_)); + for (size_t i = 0; i < pinned_buffers_.size(); i++) { + PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegBufferPinnedDestroy(pinned_buffers_[i])); + } + + // destroy nvjpeg handle and cuda stream at last + PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDestroy(handle_)); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(cuda_stream_)); + } + + void ParseOutputInfo( + const uint8_t* bit_stream, size_t bit_len, LoDTensor* out, + nvjpegImage_t* out_image, framework::ExecutionContext ctx) { + int components; + nvjpegChromaSubsampling_t subsampling; + int widths[NVJPEG_MAX_COMPONENT]; + int heights[NVJPEG_MAX_COMPONENT]; + + PADDLE_ENFORCE_NVJPEG_SUCCESS( + platform::dynload::nvjpegGetImageInfo(handle_, bit_stream, bit_len, + &components, &subsampling, widths, heights)); + + int width = widths[0]; + int height = heights[0]; + + nvjpegOutputFormat_t output_format; + int output_components; + + if (mode_ == "unchanged") { + if (components == 1) { + output_format = NVJPEG_OUTPUT_Y; + output_components = 1; + } else if (components == 3) { + output_format = NVJPEG_OUTPUT_RGB; + output_components = 3; + } else { + PADDLE_THROW(platform::errors::Fatal( + "The 
provided mode is not supported for JPEG files on GPU")); + } + } else if (mode_ == "gray") { + output_format = NVJPEG_OUTPUT_Y; + output_components = 1; + } else if (mode_ == "rgb") { + output_format = NVJPEG_OUTPUT_RGB; + output_components = 3; + } else { + PADDLE_THROW(platform::errors::Fatal( + "The provided mode is not supported for JPEG files on GPU")); + } + + PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecodeParamsSetOutputFormat(decode_params_, output_format)); + + std::vector out_shape = {output_components, height, width}; + out->Resize(framework::make_ddim(out_shape)); + + // allocate memory and assign to out_image + auto* data = out->mutable_data(ctx.GetPlace()); + for (int c = 0; c < output_components; c++) { + out_image->channel[c] = data + c * width * height; + out_image->pitch[c] = width; + } + } + + void Decode(const uint8_t* bit_stream, size_t bit_len, nvjpegImage_t* out_image) { + auto buffer = pinned_buffers_[page_id_]; + auto stream = nvjpeg_streams_[page_id_]; + page_id_ ^= 1; + + // decode jpeg in host to pinned buffer + PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegStateAttachPinnedBuffer(state_, buffer)); + PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegJpegStreamParse(handle_, bit_stream, bit_len, false, false, stream)); + PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecodeJpegHost(handle_, decoder_, state_, decode_params_, stream)); + + // transfer and decode to device buffer + PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegStateAttachDeviceBuffer(state_, device_buffer_)); + PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecodeJpegTransferToDevice(handle_, decoder_, state_, stream, cuda_stream_)); + PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecodeJpegDevice(handle_, decoder_, state_, out_image, cuda_stream_)); + + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(cuda_stream_)); + } + + void Run(const uint8_t* bit_stream, size_t bit_len, LoDTensor* out, const 
framework::ExecutionContext& ctx) { + nvjpegImage_t image; + ParseOutputInfo(bit_stream, bit_len, out, &image, ctx); + Decode(bit_stream, bit_len, &image); + } + + private: + DISABLE_COPY_AND_ASSIGN(NvjpegDecoder); + + cudaStream_t cuda_stream_ = nullptr; + std::vector nvjpeg_streams_; + + nvjpegHandle_t handle_ = nullptr; + nvjpegJpegState_t state_ = nullptr; + nvjpegJpegDecoder_t decoder_ = nullptr; + nvjpegDecodeParams_t decode_params_ = nullptr; + + nvjpegPinnedAllocator_t pinned_allocator_ = {&host_malloc, &host_free}; + nvjpegDevAllocator_t device_allocator_ = {&dev_malloc, &dev_free}; + std::vector pinned_buffers_; + nvjpegBufferDevice_t device_buffer_ = nullptr; + + int page_id_; + + const std::string mode_; +}; + + +// class NvjpegDecoderWorkerPool { +// public: +// NvjpegDecoderWorkerPool(const int num_threads, ) +// +// private: +// DISABLE_COPY_AND_ASSIGN(NvjpegDecoderWorkerPool); +// +// struct NvjpegDecoderTask { +// const uint8_t* bit_stream; +// const size_t bit_len; +// LoDTensor* out; +// } +// +// class NvjpegDecoderWorker { +// public: +// NvjpegDecoderWorker( +// const std::string mode, framework::ExecutionContext ctx, +// const int capacity) +// : mode_(mode), +// ctx_(ctx), +// capacity_(capacity), +// pool_(1) { +// +// } +// +// private: +// const std::string mode_; +// const framework::ExecutionContext ctx_; +// +// BlockingQueue +// ThreadPool pool_; +// } +// +// } + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/nvjpeg.h b/paddle/fluid/platform/dynload/nvjpeg.h index ae457b2958f5de..4c5d88b18ddfb6 100644 --- a/paddle/fluid/platform/dynload/nvjpeg.h +++ b/paddle/fluid/platform/dynload/nvjpeg.h @@ -37,11 +37,31 @@ extern void *nvjpeg_dso_handle; }; \ extern DynLoad__##__name __name -#define NVJPEG_RAND_ROUTINE_EACH(__macro) \ - __macro(nvjpegCreateSimple); \ - __macro(nvjpegJpegStateCreate); \ - __macro(nvjpegGetImageInfo); \ - __macro(nvjpegJpegStateDestroy); \ +#define 
NVJPEG_RAND_ROUTINE_EACH(__macro) \ + __macro(nvjpegCreateSimple); \ + __macro(nvjpegCreateEx); \ + __macro(nvjpegJpegStateCreate); \ + __macro(nvjpegJpegStreamCreate); \ + __macro(nvjpegDecodeParamsCreate); \ + __macro(nvjpegDecoderCreate); \ + __macro(nvjpegDecoderStateCreate); \ + __macro(nvjpegBufferDeviceCreate); \ + __macro(nvjpegBufferPinnedCreate); \ + __macro(nvjpegDecodeParamsSetOutputFormat); \ + __macro(nvjpegStateAttachPinnedBuffer); \ + __macro(nvjpegStateAttachDeviceBuffer); \ + __macro(nvjpegJpegStreamParse); \ + __macro(nvjpegDecodeJpegHost); \ + __macro(nvjpegDecodeJpegTransferToDevice); \ + __macro(nvjpegDecodeJpegDevice); \ + __macro(nvjpegJpegStreamDestroy); \ + __macro(nvjpegDecodeParamsDestroy); \ + __macro(nvjpegDecoderDestroy); \ + __macro(nvjpegBufferDeviceDestroy); \ + __macro(nvjpegBufferPinnedDestroy); \ + __macro(nvjpegGetImageInfo); \ + __macro(nvjpegJpegStateDestroy); \ + __macro(nvjpegDestroy); \ __macro(nvjpegDecode); NVJPEG_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVJPEG_WRAP); diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index a0e2dd5f7e3963..4ff30fe55740f2 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -79,6 +79,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/fluid/platform/dynload/curand.h" #include "paddle/fluid/platform/dynload/cusolver.h" +#include "paddle/fluid/platform/dynload/nvjpeg.h" #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) #include #include "paddle/fluid/platform/dynload/nccl.h" @@ -714,6 +715,7 @@ DEFINE_EXTERNAL_API_TYPE(cudnnStatus_t, CUDNN_STATUS_SUCCESS, CUDNN); DEFINE_EXTERNAL_API_TYPE(cublasStatus_t, CUBLAS_STATUS_SUCCESS, CUBLAS); DEFINE_EXTERNAL_API_TYPE(cusolverStatus_t, CUSOLVER_STATUS_SUCCESS, CUSOLVER); DEFINE_EXTERNAL_API_TYPE(cufftResult_t, CUFFT_SUCCESS, CUFFT); +DEFINE_EXTERNAL_API_TYPE(nvjpegStatus_t, NVJPEG_STATUS_SUCCESS, NVJPEG); #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) DEFINE_EXTERNAL_API_TYPE(ncclResult_t, ncclSuccess, NCCL); @@ -753,6 +755,8 @@ inline const char* GetErrorMsgUrl(T status) { break; case platform::proto::ApiType::CUFFT: return "https://docs.nvidia.com/cuda/cufft/index.html#cufftresult"; + case platform::proto::ApiType::NVJPEG: + return "https://docs.nvidia.com/cuda/nvjpeg/index.html#nvjpeg-api-return-codes"; default: return "Unknown type of External API, can't get error message URL!"; break; @@ -911,6 +915,26 @@ inline std::string build_nvidia_error_msg(cufftResult_t stat) { return sout.str(); } +/**************** NVJPEG ERROR ****************/ +inline bool is_error(nvjpegStatus_t stat) { return stat != NVJPEG_STATUS_SUCCESS; } + +inline std::string get_nvjpeg_error_str(nvjpegStatus_t stat) { + switch (stat) { + case NVJPEG_STATUS_SUCCESS: return "NVJPEG_STATUS_SUCCESS"; + case NVJPEG_STATUS_NOT_INITIALIZED: return "NVJPEG_STATUS_NOT_INITIALIZED"; + case NVJPEG_STATUS_INVALID_PARAMETER: return "NVJPEG_STATUS_INVALID_PARAMETER"; + case NVJPEG_STATUS_BAD_JPEG: return "NVJPEG_STATUS_BAD_JPEG"; + case NVJPEG_STATUS_JPEG_NOT_SUPPORTED: return "NVJPEG_STATUS_JPEG_NOT_SUPPORTED"; + case NVJPEG_STATUS_ALLOCATOR_FAILURE: return "NVJPEG_STATUS_ALLOCATOR_FAILURE"; + case 
NVJPEG_STATUS_EXECUTION_FAILED: return "NVJPEG_STATUS_EXECUTION_FAILED"; + case NVJPEG_STATUS_ARCH_MISMATCH: return "NVJPEG_STATUS_ARCH_MISMATCH"; + case NVJPEG_STATUS_INTERNAL_ERROR: return "NVJPEG_STATUS_INTERNAL_ERROR"; + case NVJPEG_STATUS_IMPLEMENTATION_NOT_SUPPORTED: + return "NVJPEG_STATUS_IMPLEMENTATION_NOT_SUPPORTED"; + } + return "Invalid nvjpeg status code"; +} + /**************** NCCL ERROR ****************/ #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) inline bool is_error(ncclResult_t nccl_result) { @@ -961,6 +985,21 @@ inline std::string build_nvidia_error_msg(ncclResult_t nccl_result) { } \ } while (0) +#define PADDLE_ENFORCE_NVJPEG_SUCCESS(COND) \ + do { \ + auto __cond__ = (COND); \ + using __NVJPEG_STATUS_TYPE__ = decltype(__cond__); \ + constexpr auto __success_type__ = \ + ::paddle::platform::details::ExternalApiType< \ + __NVJPEG_STATUS_TYPE__>::kSuccess; \ + if (UNLIKELY(__cond__ != __success_type__)) { \ + auto __summary__ = ::paddle::platform::errors::External( \ + "Nvjpeg failed: %s", \ + ::paddle::platform::get_nvjpeg_error_str(__cond__)); \ + __THROW_ERROR_INTERNAL__(__summary__); \ + } \ + } while (0) + inline void retry_sleep(unsigned milliseconds) { #ifdef _WIN32 Sleep(milliseconds); diff --git a/paddle/fluid/platform/external_error.proto b/paddle/fluid/platform/external_error.proto index cbbf803492e64f..57d0d75fbb7d2e 100644 --- a/paddle/fluid/platform/external_error.proto +++ b/paddle/fluid/platform/external_error.proto @@ -25,6 +25,7 @@ enum ApiType { CUSOLVER = 4; NCCL = 5; CUFFT = 6; + NVJPEG = 7; } message MessageDesc { @@ -44,4 +45,4 @@ message AllMessageDesc { message ExternalErrorDesc { // Error messages of different kind of external third party API repeated AllMessageDesc errors = 1; -} \ No newline at end of file +} diff --git a/python/paddle/fluid/dataloader/pipeline.py b/python/paddle/fluid/dataloader/pipeline.py index 9d291ee4ac7b6b..a7011647c94127 100755 --- a/python/paddle/fluid/dataloader/pipeline.py +++ 
b/python/paddle/fluid/dataloader/pipeline.py @@ -109,7 +109,7 @@ def __next__(self): tic = time.time() _C_ops.dataloader(self._output_vars, *self._attrs) toc = time.time() - print("_C_ops calling cost {}ms".format(toc - tic)) + print("_C_ops calling cost {}ms".format((toc - tic) * 1000.)) sys.stdout.flush() # except KeyboardInterrupt: # pass diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index cee0e53143b53f..9bdf0313cb2e10 100644 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -918,7 +918,7 @@ def file_label_reader(file_root, batch_size, name=None): return out -def image_decode(x, mode='unchanged', name=None): +def image_decode(x, mode='unchanged', num_threads=2, name=None): """ Decodes a JPEG image into a 3 dimensional RGB Tensor or 1 dimensional Gray Tensor. Optionally converts the image to the desired format. @@ -955,7 +955,7 @@ def image_decode(x, mode='unchanged', name=None): return _C_ops.decode(x, "mode", mode) inputs = {'X': x} - attrs = {"mode": mode} + attrs = {"mode": mode, "num_theads": num_threads} helper = LayerHelper("image_decode", **locals()) out = helper.create_variable( From 61c85eecaa513482f0572c30a334bec8f8910a86 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Mon, 22 Nov 2021 13:53:14 +0000 Subject: [PATCH 18/95] mult-phrase decode with thread pool --- paddle/fluid/operators/data/map_runner.cc | 2 +- paddle/fluid/operators/decode_op.cc | 3 +- paddle/fluid/operators/decode_op.cu | 48 ++++-- paddle/fluid/operators/nvjpeg_decoder.h | 4 +- .../operators/nvjpeg_decoder_thread_pool.h | 141 ++++++++++++++++++ python/paddle/vision/ops.py | 4 +- 6 files changed, 187 insertions(+), 15 deletions(-) create mode 100644 paddle/fluid/operators/nvjpeg_decoder_thread_pool.h diff --git a/paddle/fluid/operators/data/map_runner.cc b/paddle/fluid/operators/data/map_runner.cc index 233b8d10cb0e4f..07875b5b6fe10b 100644 --- a/paddle/fluid/operators/data/map_runner.cc +++ b/paddle/fluid/operators/data/map_runner.cc @@ 
-165,7 +165,7 @@ void MapRunner::Shutdown() { } // set running_ as false to exit map thread, then release thread pool - running_ = false; + running_.store(false); // FIXME: ThreadPool doesn't have shutdown method delete &thread_pool_; } diff --git a/paddle/fluid/operators/decode_op.cc b/paddle/fluid/operators/decode_op.cc index 0c4dee14399e46..98552e354057ef 100644 --- a/paddle/fluid/operators/decode_op.cc +++ b/paddle/fluid/operators/decode_op.cc @@ -99,7 +99,8 @@ or 1 dimensional Gray Tensor. Optionally converts the image to the desired format. The values of the output tensor are uint8 between 0 and 255. )DOC"); - AddAttr("num_threads", "Path of the file to be readed.").SetDefault(2); + AddAttr("num_threads", "Path of the file to be readed.") + .SetDefault(2); AddAttr( "mode", "(string, default \"unchanged\"), The read mode used " diff --git a/paddle/fluid/operators/decode_op.cu b/paddle/fluid/operators/decode_op.cu index 72597a566ae380..06109d5aae4f23 100644 --- a/paddle/fluid/operators/decode_op.cu +++ b/paddle/fluid/operators/decode_op.cu @@ -20,7 +20,7 @@ #include "paddle/fluid/platform/dynload/nvjpeg.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/stream/cuda_stream.h" -#include "paddle/fluid/operators/nvjpeg_decoder.h" +#include "paddle/fluid/operators/nvjpeg_decoder_thread_pool.h" namespace paddle { namespace operators { @@ -29,7 +29,8 @@ static std::vector nvjpeg_streams; static nvjpegHandle_t batch_nvjpeg_handle = nullptr; static std::unique_ptr<::ThreadPool> pool_; -static NvjpegDecoder* nvjpeg_decoder = nullptr; +// static NvjpegDecoder* nvjpeg_decoder = nullptr; +static NvjpegDecoderThreadPool* decode_pool = nullptr; void batch_InitNvjpegImage(nvjpegImage_t* img) { for (int c = 0; c < NVJPEG_MAX_COMPONENT; c++) { @@ -42,13 +43,13 @@ template class GPUBatchDecodeJpegKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - LOG(ERROR) << "GPUBatchDecodeJpegKernel 
Compute start"; - int num_threads_ = ctx.Attr("num_threads"); + int num_threads = ctx.Attr("num_threads"); + LOG(ERROR) << "GPUBatchDecodeJpegKernel Compute start, num_threads: " << num_threads; auto mode = ctx.Attr("mode"); - // multi-phrase decode - if (!nvjpeg_decoder) { - nvjpeg_decoder = new NvjpegDecoder(mode); + // multi-phrase decode thread pool + if (!decode_pool) { + decode_pool = new NvjpegDecoderThreadPool(num_threads, mode); } const framework::LoDTensorArray* inputs = @@ -61,10 +62,39 @@ class GPUBatchDecodeJpegKernel : public framework::OpKernel { for (size_t i = 0; i < inputs->size(); i++) { const framework::LoDTensor x = inputs->at(i); auto* x_data = x.data(); + size_t x_numel = static_cast(x.numel()); - nvjpeg_decoder->Run(x_data, static_cast(x.numel()), - &out_array[i], ctx); + NvjpegDecodeWork work = { + .bit_stream = x_data, + .bit_len = x_numel, + .tensor = &out_array[i], + .ctx = ctx + }; + decode_pool->AddWork(std::make_shared(work)); } + + decode_pool->RunAll(true); + + // // multi-phrase decode single thread + // if (!nvjpeg_decoder) { + // nvjpeg_decoder = new NvjpegDecoder(mode); + // } + // + // const framework::LoDTensorArray* inputs = + // ctx.Input("X"); + // + // auto* out = ctx.OutputVar("Out"); + // auto& out_array = *out->GetMutable(); + // out_array.resize(inputs->size()); + // + // for (size_t i = 0; i < inputs->size(); i++) { + // const framework::LoDTensor x = inputs->at(i); + // auto* x_data = x.data(); + // + // nvjpeg_decoder->Run(x_data, static_cast(x.numel()), + // &out_array[i], &ctx); + // } + // // Create nvJPEG handle // if (batch_nvjpeg_handle == nullptr) { // nvjpegStatus_t create_status = diff --git a/paddle/fluid/operators/nvjpeg_decoder.h b/paddle/fluid/operators/nvjpeg_decoder.h index 356d056645ef51..5b9ecf2780dae5 100644 --- a/paddle/fluid/operators/nvjpeg_decoder.h +++ b/paddle/fluid/operators/nvjpeg_decoder.h @@ -92,7 +92,7 @@ class NvjpegDecoder { void ParseOutputInfo( const uint8_t* bit_stream, size_t 
bit_len, LoDTensor* out, - nvjpegImage_t* out_image, framework::ExecutionContext ctx) { + nvjpegImage_t* out_image, framework::ExecutionContext& ctx) { int components; nvjpegChromaSubsampling_t subsampling; int widths[NVJPEG_MAX_COMPONENT]; @@ -161,7 +161,7 @@ class NvjpegDecoder { PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(cuda_stream_)); } - void Run(const uint8_t* bit_stream, size_t bit_len, LoDTensor* out, const framework::ExecutionContext& ctx) { + void Run(const uint8_t* bit_stream, size_t bit_len, LoDTensor* out, framework::ExecutionContext& ctx) { nvjpegImage_t image; ParseOutputInfo(bit_stream, bit_len, out, &image, ctx); Decode(bit_stream, bit_len, &image); diff --git a/paddle/fluid/operators/nvjpeg_decoder_thread_pool.h b/paddle/fluid/operators/nvjpeg_decoder_thread_pool.h new file mode 100644 index 00000000000000..99b18e810f5a73 --- /dev/null +++ b/paddle/fluid/operators/nvjpeg_decoder_thread_pool.h @@ -0,0 +1,141 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include + +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/operators/nvjpeg_decoder.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; + +struct NvjpegDecodeWork { + const uint8_t* bit_stream; + size_t bit_len; + LoDTensor* tensor; + framework::ExecutionContext ctx; +}; + +class NvjpegDecoderThreadPool { + public: + NvjpegDecoderThreadPool(const int num_threads, const std::string mode) + : threads_(num_threads), + mode_(mode), + shutdown_(false), + running_(false), + completed_(false), + outstand_works_(0) { + PADDLE_ENFORCE_GT(num_threads, 0, platform::errors::InvalidArgument( + "num_threads shoule be a positive interger, " + "but got %d", num_threads)); + for (int i = 0; i < num_threads; i++) { + threads_.emplace_back( + std::thread(std::bind(&NvjpegDecoderThreadPool::ThreadLoop, this, i))); + } + } + + void AddWork(std::shared_ptr work) { + work_queue_.push_back(work); + } + + void RunAll(const bool wait, const bool sort = true) { + // Sort images in length desending order + if (sort) SortWorkByLengthDescend(); + + { + std::lock_guard lock(mutex_); + completed_ = false; + running_ = true; + } + running_cond_.notify_all(); + + if (wait) WaitTillWorksCompleted(); + } + + void WaitTillWorksCompleted() { + std::unique_lock lock(mutex_); + completed_cond_.wait(lock, [this] { return this->completed_; }); + running_ = false; + } + + void Shutdown() { + std::lock_guard lock(mutex_); + + running_ = false; + shutdown_.store(true); + running_cond_.notify_all(); + + work_queue_.clear(); + + for (auto &thread : threads_) { + thread.join(); + } + } + + private: + std::vector threads_; + std::string mode_; + + std::deque> work_queue_; + std::mutex mutex_; + + std::atomic shutdown_; + std::condition_variable running_cond_; + bool running_; + std::condition_variable completed_cond_; + bool completed_; + + int outstand_works_; + + void SortWorkByLengthDescend() { + std::lock_guard lock(mutex_); + 
std::sort(work_queue_.begin(), work_queue_.end(), + [](const std::shared_ptr a, + const std::shared_ptr b) { + return b->bit_len < a->bit_len; + }); + } + + void ThreadLoop(const int thread_idx) { + NvjpegDecoder* decoder = new NvjpegDecoder(mode_); + + while (!shutdown_.load()) { + std::unique_lock lock(mutex_); + running_cond_.wait(lock, [this] { return running_ && !work_queue_.empty(); }); + if (shutdown_.load()) break; + + auto work = work_queue_.front(); + work_queue_.pop_front(); + outstand_works_++; + lock.unlock(); + + decoder->Run(work->bit_stream, work->bit_len, work->tensor, work->ctx); + + lock.lock(); + outstand_works_--; + if (outstand_works_ == 0 && work_queue_.empty()) { + completed_ = true; + lock.unlock(); + completed_cond_.notify_one(); + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index 9bdf0313cb2e10..6d17a83a2af791 100644 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -955,9 +955,9 @@ def image_decode(x, mode='unchanged', num_threads=2, name=None): return _C_ops.decode(x, "mode", mode) inputs = {'X': x} - attrs = {"mode": mode, "num_theads": num_threads} + attrs = {"mode": mode, "num_threads": num_threads} - helper = LayerHelper("image_decode", **locals()) + helper = LayerHelper("decode", **locals()) out = helper.create_variable( name=unique_name.generate("image_decode"), type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, From c5043dc7e1c4e70181c92f869093d119642197bc Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 23 Nov 2021 07:12:20 +0000 Subject: [PATCH 19/95] polish code --- paddle/fluid/operators/data/CMakeLists.txt | 3 + .../{decode_op.cc => data/batch_decode_op.cc} | 6 +- .../fluid/operators/data/batch_decode_op.cu | 96 +++++++ paddle/fluid/operators/data/nvjpeg_decoder.cc | 246 ++++++++++++++++++ paddle/fluid/operators/data/nvjpeg_decoder.h | 116 +++++++++ .../operators/data/unity_build_rule.cmake | 7 +- 
paddle/fluid/operators/decode_op.cu | 228 ---------------- paddle/fluid/operators/nvjpeg_decoder.h | 228 ---------------- .../operators/nvjpeg_decoder_thread_pool.h | 141 ---------- python/paddle/vision/ops.py | 4 +- 10 files changed, 472 insertions(+), 603 deletions(-) rename paddle/fluid/operators/{decode_op.cc => data/batch_decode_op.cc} (95%) create mode 100644 paddle/fluid/operators/data/batch_decode_op.cu create mode 100644 paddle/fluid/operators/data/nvjpeg_decoder.cc create mode 100644 paddle/fluid/operators/data/nvjpeg_decoder.h delete mode 100644 paddle/fluid/operators/decode_op.cu delete mode 100644 paddle/fluid/operators/nvjpeg_decoder.h delete mode 100644 paddle/fluid/operators/nvjpeg_decoder_thread_pool.h diff --git a/paddle/fluid/operators/data/CMakeLists.txt b/paddle/fluid/operators/data/CMakeLists.txt index 2f485732cff904..906d280a77b059 100644 --- a/paddle/fluid/operators/data/CMakeLists.txt +++ b/paddle/fluid/operators/data/CMakeLists.txt @@ -10,6 +10,9 @@ op_library(dataloader_op SRCS dataloader_op.cc dataloader_op.cu.cc DEPS pipeline cc_library(map_runner SRCS map_runner.cc DEPS parallel_executor simple_threadpool scope) op_library(map_op SRCS map_op.cc map_op.cu.cc DEPS map_runner ${OP_HEADER_DEPS}) +cc_library(nvjpeg_decoder SRCS nvjpeg_decoder.cc DEPS ${OP_HEADER_DEPS}) +op_library(batch_decode_op SRCS batch_decode_op.cc batch_decode_op.cu DEPS nvjpeg_decoder ${OP_HEADER_DEPS}) + # register_operators() # TODO: add test here diff --git a/paddle/fluid/operators/decode_op.cc b/paddle/fluid/operators/data/batch_decode_op.cc similarity index 95% rename from paddle/fluid/operators/decode_op.cc rename to paddle/fluid/operators/data/batch_decode_op.cc index 98552e354057ef..c7fb58c3c2027e 100644 --- a/paddle/fluid/operators/decode_op.cc +++ b/paddle/fluid/operators/data/batch_decode_op.cc @@ -23,6 +23,7 @@ namespace paddle { namespace operators { +namespace data { template class CPUBatchDecodeJpegKernel : public framework::OpKernel { @@ -110,14 
+111,15 @@ and 255. } }; +} // namespace data } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR( - decode, ops::BatchDecodeJpegOp, ops::BatchDecodeJpegOpMaker, + batch_decode, ops::data::BatchDecodeJpegOp, ops::data::BatchDecodeJpegOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker) -REGISTER_OP_CPU_KERNEL(decode, ops::CPUBatchDecodeJpegKernel) +REGISTER_OP_CPU_KERNEL(batch_decode, ops::data::CPUBatchDecodeJpegKernel) diff --git a/paddle/fluid/operators/data/batch_decode_op.cu b/paddle/fluid/operators/data/batch_decode_op.cu new file mode 100644 index 00000000000000..8a08ba584dfead --- /dev/null +++ b/paddle/fluid/operators/data/batch_decode_op.cu @@ -0,0 +1,96 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#if !defined(WITH_NV_JETSON) && !defined(PADDLE_WITH_HIP) + +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/operators/data/nvjpeg_decoder.h" + +namespace paddle { +namespace operators { +namespace data { + +static NvjpegDecoderThreadPool* decode_pool = nullptr; + +template +class GPUBatchDecodeJpegKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + int num_threads = ctx.Attr("num_threads"); + LOG(ERROR) << "GPUBatchDecodeJpegKernel Compute start, num_threads: " << num_threads; + auto mode = ctx.Attr("mode"); + + // multi-phrase decode thread pool + if (!decode_pool) { + decode_pool = new NvjpegDecoderThreadPool(num_threads, mode); + } + + const framework::LoDTensorArray* inputs = + ctx.Input("X"); + + auto* out = ctx.OutputVar("Out"); + auto& out_array = *out->GetMutable(); + out_array.resize(inputs->size()); + + for (size_t i = 0; i < inputs->size(); i++) { + const framework::LoDTensor x = inputs->at(i); + auto* x_data = x.data(); + size_t x_numel = static_cast(x.numel()); + + NvjpegDecodeTask task = { + .bit_stream = x_data, + .bit_len = x_numel, + .tensor = &out_array[i], + .place = ctx.GetPlace() + }; + decode_pool->AddTask(std::make_shared(task)); + } + + decode_pool->RunAll(true); + + // // multi-phrase decode single thread + // if (!nvjpeg_decoder) { + // nvjpeg_decoder = new NvjpegDecoder(mode); + // } + // + // const framework::LoDTensorArray* inputs = + // ctx.Input("X"); + // + // auto* out = ctx.OutputVar("Out"); + // auto& out_array = *out->GetMutable(); + // out_array.resize(inputs->size()); + // + // for (size_t i = 0; i < inputs->size(); i++) { + // const framework::LoDTensor x = inputs->at(i); + // auto* x_data = x.data(); + // + // nvjpeg_decoder->Run(x_data, static_cast(x.numel()), + // &out_array[i], &ctx); + // } + + LOG(ERROR) << "GPUBatchDecodeJpegKernel Compute finish"; + } +}; + +} 
// namespace data +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(batch_decode, ops::data::GPUBatchDecodeJpegKernel) + +#endif diff --git a/paddle/fluid/operators/data/nvjpeg_decoder.cc b/paddle/fluid/operators/data/nvjpeg_decoder.cc new file mode 100644 index 00000000000000..6b8c0844cfefc2 --- /dev/null +++ b/paddle/fluid/operators/data/nvjpeg_decoder.cc @@ -0,0 +1,246 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/data/nvjpeg_decoder.h" + +namespace paddle { +namespace operators { +namespace data { + +NvjpegDecoder::NvjpegDecoder(std::string mode) + : nvjpeg_streams_(2), + pinned_buffers_(2), + page_id_(0), + mode_(mode) { + // create cuda stream + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreateWithFlags(&cuda_stream_, cudaStreamNonBlocking)); + + // create nvjpeg handle and stream + // device_allocator_.dev_malloc = &cudaMalloc; + // device_allocator_.dev_free = &cudaFree; + // pinned_allocator_.pinned_malloc = &cudaMallocHost; + // pinned_allocator_.pinned_free = &cudaFreeHost; + PADDLE_ENFORCE_NVJPEG_SUCCESS( + platform::dynload::nvjpegCreateEx(NVJPEG_BACKEND_DEFAULT, &device_allocator_, + &pinned_allocator_, 0, &handle_)); + for (size_t i = 0; i < nvjpeg_streams_.size(); i++) { + PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegJpegStreamCreate(handle_, &nvjpeg_streams_[i])); + } + + // create decode params, decoder and state + PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecodeParamsCreate(handle_, &decode_params_)); + PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecoderCreate(handle_, NVJPEG_BACKEND_DEFAULT, &decoder_)); + PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecoderStateCreate(handle_, decoder_, &state_)); + + // create device & pinned buffer + PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegBufferDeviceCreate(handle_, &device_allocator_, &device_buffer_)); + for (size_t i = 0; i < pinned_buffers_.size(); i++) { + PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegBufferPinnedCreate(handle_, &pinned_allocator_, &pinned_buffers_[i])); + } +} + +NvjpegDecoder::~NvjpegDecoder() { + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(cuda_stream_)); + + // destroy nvjpeg streams + for (size_t i = 0; i < nvjpeg_streams_.size(); i++) { + PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegJpegStreamDestroy(nvjpeg_streams_[i])); + } + + // destroy decode params, decoder and state + 
PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecodeParamsDestroy(decode_params_)); + PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecoderDestroy(decoder_)); + PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegJpegStateDestroy(state_)); + + PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegBufferDeviceDestroy(device_buffer_)); + for (size_t i = 0; i < pinned_buffers_.size(); i++) { + PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegBufferPinnedDestroy(pinned_buffers_[i])); + } + + // destroy nvjpeg handle and cuda stream at last + PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDestroy(handle_)); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(cuda_stream_)); +} + +void NvjpegDecoder::ParseOutputInfo( + const uint8_t* bit_stream, size_t bit_len, framework::LoDTensor* out, + nvjpegImage_t* out_image, platform::Place place) { + int components; + nvjpegChromaSubsampling_t subsampling; + int widths[NVJPEG_MAX_COMPONENT]; + int heights[NVJPEG_MAX_COMPONENT]; + + PADDLE_ENFORCE_NVJPEG_SUCCESS( + platform::dynload::nvjpegGetImageInfo(handle_, bit_stream, bit_len, + &components, &subsampling, widths, heights)); + + int width = widths[0]; + int height = heights[0]; + + nvjpegOutputFormat_t output_format; + int output_components; + + if (mode_ == "unchanged") { + if (components == 1) { + output_format = NVJPEG_OUTPUT_Y; + output_components = 1; + } else if (components == 3) { + output_format = NVJPEG_OUTPUT_RGB; + output_components = 3; + } else { + PADDLE_THROW(platform::errors::Fatal( + "The provided mode is not supported for JPEG files on GPU")); + } + } else if (mode_ == "gray") { + output_format = NVJPEG_OUTPUT_Y; + output_components = 1; + } else if (mode_ == "rgb") { + output_format = NVJPEG_OUTPUT_RGB; + output_components = 3; + } else { + PADDLE_THROW(platform::errors::Fatal( + "The provided mode is not supported for JPEG files on GPU")); + } + + 
PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecodeParamsSetOutputFormat(decode_params_, output_format)); + + std::vector out_shape = {output_components, height, width}; + out->Resize(framework::make_ddim(out_shape)); + + // allocate memory and assign to out_image + auto* data = out->mutable_data(place); + for (int c = 0; c < output_components; c++) { + out_image->channel[c] = data + c * width * height; + out_image->pitch[c] = width; + } +} + +void NvjpegDecoder::Decode(const uint8_t* bit_stream, size_t bit_len, nvjpegImage_t* out_image) { + auto buffer = pinned_buffers_[page_id_]; + auto stream = nvjpeg_streams_[page_id_]; + page_id_ ^= 1; + + // decode jpeg in host to pinned buffer + PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegStateAttachPinnedBuffer(state_, buffer)); + PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegJpegStreamParse(handle_, bit_stream, bit_len, false, false, stream)); + PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecodeJpegHost(handle_, decoder_, state_, decode_params_, stream)); + + // transfer and decode to device buffer + PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegStateAttachDeviceBuffer(state_, device_buffer_)); + PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecodeJpegTransferToDevice(handle_, decoder_, state_, stream, cuda_stream_)); + PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecodeJpegDevice(handle_, decoder_, state_, out_image, cuda_stream_)); + + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(cuda_stream_)); +} + +void NvjpegDecoder::Run(const uint8_t* bit_stream, size_t bit_len, + framework::LoDTensor* out, platform::Place& place) { + nvjpegImage_t image; + ParseOutputInfo(bit_stream, bit_len, out, &image, place); + Decode(bit_stream, bit_len, &image); +} + +NvjpegDecoderThreadPool::NvjpegDecoderThreadPool(const int num_threads, const std::string mode) + : threads_(num_threads), + mode_(mode), + shutdown_(false), + running_(false), + completed_(false), + 
outstand_tasks_(0) { + PADDLE_ENFORCE_GT(num_threads, 0, platform::errors::InvalidArgument( + "num_threads shoule be a positive interger, " + "but got %d", num_threads)); + for (int i = 0; i < num_threads; i++) { + threads_.emplace_back( + std::thread(std::bind(&NvjpegDecoderThreadPool::ThreadLoop, this, i))); + } +} + +NvjpegDecoderThreadPool::~NvjpegDecoderThreadPool() { Shutdown(); } + +void NvjpegDecoderThreadPool::AddTask(std::shared_ptr task) { + task_queue_.push_back(task); +} + +void NvjpegDecoderThreadPool::RunAll(const bool wait, const bool sort) { + // Sort images in length desending order + if (sort) SortTaskByLengthDescend(); + + { + std::lock_guard lock(mutex_); + completed_ = false; + running_ = true; + } + running_cond_.notify_all(); + + if (wait) WaitTillTasksCompleted(); +} + +void NvjpegDecoderThreadPool::WaitTillTasksCompleted() { + std::unique_lock lock(mutex_); + completed_cond_.wait(lock, [this] { return this->completed_; }); + running_ = false; +} + +void NvjpegDecoderThreadPool::Shutdown() { + std::lock_guard lock(mutex_); + + running_ = false; + shutdown_.store(true); + running_cond_.notify_all(); + + task_queue_.clear(); + + for (auto &thread : threads_) { + thread.join(); + } +} + +void NvjpegDecoderThreadPool::SortTaskByLengthDescend() { + std::lock_guard lock(mutex_); + std::sort(task_queue_.begin(), task_queue_.end(), + [](const std::shared_ptr a, + const std::shared_ptr b) { + return b->bit_len < a->bit_len; + }); +} + +void NvjpegDecoderThreadPool::ThreadLoop(const int thread_idx) { + NvjpegDecoder* decoder = new NvjpegDecoder(mode_); + + while (!shutdown_.load()) { + std::unique_lock lock(mutex_); + running_cond_.wait(lock, [this] { return running_ && !task_queue_.empty(); }); + if (shutdown_.load()) break; + + auto task = task_queue_.front(); + task_queue_.pop_front(); + outstand_tasks_++; + lock.unlock(); + + decoder->Run(task->bit_stream, task->bit_len, task->tensor, task->place); + + lock.lock(); + outstand_tasks_--; + if 
(outstand_tasks_ == 0 && task_queue_.empty()) { + completed_ = true; + lock.unlock(); + completed_cond_.notify_one(); + } + } +} + +} // namespace data +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/data/nvjpeg_decoder.h b/paddle/fluid/operators/data/nvjpeg_decoder.h new file mode 100644 index 00000000000000..3467a70b6082be --- /dev/null +++ b/paddle/fluid/operators/data/nvjpeg_decoder.h @@ -0,0 +1,116 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/platform/dynload/nvjpeg.h" +#include "paddle/fluid/platform/stream/cuda_stream.h" + +namespace paddle { +namespace operators { +namespace data { + +static int dev_malloc(void **p, size_t s) { return (int)cudaMalloc(p, s); } +static int dev_free(void *p) { return (int)cudaFree(p); } + +static int host_malloc(void** p, size_t s, unsigned int f) { return (int)cudaHostAlloc(p, s, f); } +static int host_free(void* p) { return (int)cudaFreeHost(p); } + +struct NvjpegDecodeTask { + const uint8_t* bit_stream; + size_t bit_len; + framework::LoDTensor* tensor; + platform::Place place; +}; + +class NvjpegDecoder { + public: + NvjpegDecoder(const std::string mode); + + ~NvjpegDecoder(); + + void Run(const uint8_t* bit_stream, size_t bit_len, + framework::LoDTensor* out, platform::Place& place); + + private: + DISABLE_COPY_AND_ASSIGN(NvjpegDecoder); + + void ParseOutputInfo( + const uint8_t* bit_stream, size_t bit_len, framework::LoDTensor* out, + nvjpegImage_t* out_image, platform::Place place); + + void Decode(const uint8_t* bit_stream, size_t bit_len, nvjpegImage_t* out_image); + + + cudaStream_t cuda_stream_ = nullptr; + std::vector nvjpeg_streams_; + + nvjpegHandle_t handle_ = nullptr; + nvjpegJpegState_t state_ = nullptr; + nvjpegJpegDecoder_t decoder_ = nullptr; + nvjpegDecodeParams_t decode_params_ = nullptr; + + nvjpegPinnedAllocator_t pinned_allocator_ = {&host_malloc, &host_free}; + nvjpegDevAllocator_t device_allocator_ = {&dev_malloc, &dev_free}; + std::vector pinned_buffers_; + nvjpegBufferDevice_t device_buffer_ = nullptr; + + int page_id_; + + const std::string mode_; +}; + +class NvjpegDecoderThreadPool { + public: + NvjpegDecoderThreadPool(const int num_threads, const std::string mode); + + ~NvjpegDecoderThreadPool(); + + void AddTask(std::shared_ptr task); + + void 
RunAll(const bool wait, const bool sort = true); + + void WaitTillTasksCompleted(); + + void Shutdown(); + + private: + DISABLE_COPY_AND_ASSIGN(NvjpegDecoderThreadPool); + + void SortTaskByLengthDescend(); + + void ThreadLoop(const int thread_idx); + + std::vector threads_; + std::string mode_; + + std::deque> task_queue_; + std::mutex mutex_; + + std::atomic shutdown_; + std::condition_variable running_cond_; + bool running_; + std::condition_variable completed_cond_; + bool completed_; + + int outstand_tasks_; +}; + +} // namespace data +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/data/unity_build_rule.cmake b/paddle/fluid/operators/data/unity_build_rule.cmake index a6c05c6a3623f7..bc15a29e43743c 100644 --- a/paddle/fluid/operators/data/unity_build_rule.cmake +++ b/paddle/fluid/operators/data/unity_build_rule.cmake @@ -7,9 +7,12 @@ register_unity_group(cc pipeline.cc map_runner.cc + nvjpeg_decoder.cc dataloader_op.cc - map_op.cc) + map_op.cc + batch_decode_op.cc) register_unity_group(cu dataloader_op.cu.cc - map_op.cu.cc) + map_op.cu.cc + batch_decode_op.cu) diff --git a/paddle/fluid/operators/decode_op.cu b/paddle/fluid/operators/decode_op.cu deleted file mode 100644 index 06109d5aae4f23..00000000000000 --- a/paddle/fluid/operators/decode_op.cu +++ /dev/null @@ -1,228 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#if !defined(WITH_NV_JETSON) && !defined(PADDLE_WITH_HIP) - -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/dynload/nvjpeg.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/stream/cuda_stream.h" -#include "paddle/fluid/operators/nvjpeg_decoder_thread_pool.h" - -namespace paddle { -namespace operators { - -static std::vector nvjpeg_streams; -static nvjpegHandle_t batch_nvjpeg_handle = nullptr; -static std::unique_ptr<::ThreadPool> pool_; - -// static NvjpegDecoder* nvjpeg_decoder = nullptr; -static NvjpegDecoderThreadPool* decode_pool = nullptr; - -void batch_InitNvjpegImage(nvjpegImage_t* img) { - for (int c = 0; c < NVJPEG_MAX_COMPONENT; c++) { - img->channel[c] = nullptr; - img->pitch[c] = 0; - } -} - -template -class GPUBatchDecodeJpegKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - int num_threads = ctx.Attr("num_threads"); - LOG(ERROR) << "GPUBatchDecodeJpegKernel Compute start, num_threads: " << num_threads; - auto mode = ctx.Attr("mode"); - - // multi-phrase decode thread pool - if (!decode_pool) { - decode_pool = new NvjpegDecoderThreadPool(num_threads, mode); - } - - const framework::LoDTensorArray* inputs = - ctx.Input("X"); - - auto* out = ctx.OutputVar("Out"); - auto& out_array = *out->GetMutable(); - out_array.resize(inputs->size()); - - for (size_t i = 0; i < inputs->size(); i++) { - const framework::LoDTensor x = inputs->at(i); - auto* x_data = x.data(); - size_t x_numel = static_cast(x.numel()); - - NvjpegDecodeWork work = { - .bit_stream = x_data, - .bit_len = x_numel, - .tensor = &out_array[i], - .ctx = ctx - }; - decode_pool->AddWork(std::make_shared(work)); - } - - decode_pool->RunAll(true); - - // // multi-phrase decode single thread - // if (!nvjpeg_decoder) { - // nvjpeg_decoder = new NvjpegDecoder(mode); - // } - // - // const framework::LoDTensorArray* inputs = - // ctx.Input("X"); - 
// - // auto* out = ctx.OutputVar("Out"); - // auto& out_array = *out->GetMutable(); - // out_array.resize(inputs->size()); - // - // for (size_t i = 0; i < inputs->size(); i++) { - // const framework::LoDTensor x = inputs->at(i); - // auto* x_data = x.data(); - // - // nvjpeg_decoder->Run(x_data, static_cast(x.numel()), - // &out_array[i], &ctx); - // } - - // // Create nvJPEG handle - // if (batch_nvjpeg_handle == nullptr) { - // nvjpegStatus_t create_status = - // platform::dynload::nvjpegCreateSimple(&batch_nvjpeg_handle); - // - // PADDLE_ENFORCE_EQ(create_status, NVJPEG_STATUS_SUCCESS, - // platform::errors::Fatal("nvjpegCreateSimple failed: ", - // create_status)); - // - // nvjpeg_streams.reserve(num_threads_); - // - // for (int i = 0; i < num_threads_; i++) { - // cudaStreamCreateWithFlags(&nvjpeg_streams[i], cudaStreamNonBlocking); - // } - // pool_.reset(new ::ThreadPool(num_threads_)); - // } - // - // const framework::LoDTensorArray* ins = - // ctx.Input("X"); - // - // auto* out = ctx.OutputVar("Out"); - // auto& out_array = *out->GetMutable(); - // out_array.resize(ins->size()); - // - // std::vector> tasks(ins->size()); - // - // auto dev = ctx.GetPlace(); - // for (int i = 0; i < ins->size(); i++) { - // auto nvjpeg_stream = nvjpeg_streams[i % num_threads_]; - // auto nvjpeg_handle = batch_nvjpeg_handle; - // tasks[i] = pool_->enqueue([this, i, ins, &out_array, mode, nvjpeg_handle, - // nvjpeg_stream, dev]() -> int { - // nvjpegJpegState_t nvjpeg_state; - // nvjpegStatus_t state_status = platform::dynload::nvjpegJpegStateCreate( - // batch_nvjpeg_handle, &nvjpeg_state); - // - // PADDLE_ENFORCE_EQ(state_status, NVJPEG_STATUS_SUCCESS, - // platform::errors::Fatal( - // "nvjpegJpegStateCreate failed: ", state_status)); - // const framework::LoDTensor x = ins->at(i); - // // framework::LoDTensor out = out_array.at(i); - // int components; - // nvjpegChromaSubsampling_t subsampling; - // int widths[NVJPEG_MAX_COMPONENT]; - // int 
heights[NVJPEG_MAX_COMPONENT]; - // - // auto* x_data = x.data(); - // - // nvjpegStatus_t info_status = platform::dynload::nvjpegGetImageInfo( - // batch_nvjpeg_handle, x_data, (size_t)x.numel(), &components, - // &subsampling, widths, heights); - // - // PADDLE_ENFORCE_EQ(info_status, NVJPEG_STATUS_SUCCESS, - // platform::errors::Fatal("nvjpegGetImageInfo failed: ", - // info_status)); - // - // int width = widths[0]; - // int height = heights[0]; - // - // nvjpegOutputFormat_t output_format; - // int output_components; - // - // if (mode == "unchanged") { - // if (components == 1) { - // output_format = NVJPEG_OUTPUT_Y; - // output_components = 1; - // } else if (components == 3) { - // output_format = NVJPEG_OUTPUT_RGB; - // output_components = 3; - // } else { - // platform::dynload::nvjpegJpegStateDestroy(nvjpeg_state); - // PADDLE_THROW(platform::errors::Fatal( - // "The provided mode is not supported for JPEG files on GPU")); - // } - // } else if (mode == "gray") { - // output_format = NVJPEG_OUTPUT_Y; - // output_components = 1; - // } else if (mode == "rgb") { - // output_format = NVJPEG_OUTPUT_RGB; - // output_components = 3; - // } else { - // platform::dynload::nvjpegJpegStateDestroy(nvjpeg_state); - // PADDLE_THROW(platform::errors::Fatal( - // "The provided mode is not supported for JPEG files on GPU")); - // } - // - // nvjpegImage_t out_image; - // batch_InitNvjpegImage(&out_image); - // - // // create nvjpeg stream - // // if (batch_nvjpeg_stream == nullptr) { - // // cudaStreamCreateWithFlags(&batch_nvjpeg_stream, - // // cudaStreamNonBlocking); - // // } - // - // int sz = widths[0] * heights[0]; - // - // // auto* out = ctx.Output("Out"); - // std::vector out_shape = {output_components, height, width}; - // out_array.at(i).Resize(framework::make_ddim(out_shape)); - // - // uint8_t* data = out_array.at(i).mutable_data(dev); - // // transfer and decode to device buffer - // - // for (int c = 0; c < output_components; c++) { - // 
out_image.channel[c] = data + c * sz; - // out_image.pitch[c] = width; - // } - // - // nvjpegStatus_t decode_status = platform::dynload::nvjpegDecode( - // batch_nvjpeg_handle, nvjpeg_state, x_data, x.numel(), output_format, - // &out_image, nvjpeg_stream); - // // std:: cout << "task read ok: " << i << std:: endl; - // return 0; - // }); - // } - // - // for (size_t i = 0; i < tasks.size(); ++i) { - // tasks[i].wait(); - // } - LOG(ERROR) << "GPUBatchDecodeJpegKernel Compute finish"; - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(decode, ops::GPUBatchDecodeJpegKernel) - -#endif diff --git a/paddle/fluid/operators/nvjpeg_decoder.h b/paddle/fluid/operators/nvjpeg_decoder.h deleted file mode 100644 index 5b9ecf2780dae5..00000000000000 --- a/paddle/fluid/operators/nvjpeg_decoder.h +++ /dev/null @@ -1,228 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/dynload/nvjpeg.h" -#include "paddle/fluid/platform/stream/cuda_stream.h" - -namespace paddle { -namespace operators { - -using LoDTensor = framework::LoDTensor; - -static int dev_malloc(void **p, size_t s) { return (int)cudaMalloc(p, s); } -static int dev_free(void *p) { return (int)cudaFree(p); } - -static int host_malloc(void** p, size_t s, unsigned int f) { - return (int)cudaHostAlloc(p, s, f); -} - -static int host_free(void* p) { return (int)cudaFreeHost(p); } - -class NvjpegDecoder { - public: - NvjpegDecoder(std::string mode) - : nvjpeg_streams_(2), - page_id_(0), - pinned_buffers_(2), - mode_(mode) { - // create cuda stream - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreateWithFlags(&cuda_stream_, cudaStreamNonBlocking)); - - // create nvjpeg handle and stream - // device_allocator_.dev_malloc = &cudaMalloc; - // device_allocator_.dev_free = &cudaFree; - // pinned_allocator_.pinned_malloc = &cudaMallocHost; - // pinned_allocator_.pinned_free = &cudaFreeHost; - PADDLE_ENFORCE_NVJPEG_SUCCESS( - platform::dynload::nvjpegCreateEx(NVJPEG_BACKEND_DEFAULT, &device_allocator_, - &pinned_allocator_, 0, &handle_)); - for (size_t i; i < nvjpeg_streams_.size(); i++) { - PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegJpegStreamCreate(handle_, &nvjpeg_streams_[i])); - } - - // create decode params, decoder and state - PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecodeParamsCreate(handle_, &decode_params_)); - PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecoderCreate(handle_, NVJPEG_BACKEND_DEFAULT, &decoder_)); - PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecoderStateCreate(handle_, decoder_, &state_)); - - // create device & pinned buffer - PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegBufferDeviceCreate(handle_, &device_allocator_, &device_buffer_)); - for (size_t i = 0; i < pinned_buffers_.size(); i++) { - 
PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegBufferPinnedCreate(handle_, &pinned_allocator_, &pinned_buffers_[i])); - } - } - - ~NvjpegDecoder() { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(cuda_stream_)); - - // destroy nvjpeg streams - for (size_t i = 0; i < nvjpeg_streams_.size(); i++) { - PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegJpegStreamDestroy(nvjpeg_streams_[i])); - } - - // destroy decode params, decoder and state - PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecodeParamsDestroy(decode_params_)); - PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecoderDestroy(decoder_)); - PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegJpegStateDestroy(state_)); - - PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegBufferDeviceDestroy(device_buffer_)); - for (size_t i = 0; i < pinned_buffers_.size(); i++) { - PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegBufferPinnedDestroy(pinned_buffers_[i])); - } - - // destroy nvjpeg handle and cuda stream at last - PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDestroy(handle_)); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(cuda_stream_)); - } - - void ParseOutputInfo( - const uint8_t* bit_stream, size_t bit_len, LoDTensor* out, - nvjpegImage_t* out_image, framework::ExecutionContext& ctx) { - int components; - nvjpegChromaSubsampling_t subsampling; - int widths[NVJPEG_MAX_COMPONENT]; - int heights[NVJPEG_MAX_COMPONENT]; - - PADDLE_ENFORCE_NVJPEG_SUCCESS( - platform::dynload::nvjpegGetImageInfo(handle_, bit_stream, bit_len, - &components, &subsampling, widths, heights)); - - int width = widths[0]; - int height = heights[0]; - - nvjpegOutputFormat_t output_format; - int output_components; - - if (mode_ == "unchanged") { - if (components == 1) { - output_format = NVJPEG_OUTPUT_Y; - output_components = 1; - } else if (components == 3) { - output_format = NVJPEG_OUTPUT_RGB; - output_components = 3; - } else { - PADDLE_THROW(platform::errors::Fatal( - "The 
provided mode is not supported for JPEG files on GPU")); - } - } else if (mode_ == "gray") { - output_format = NVJPEG_OUTPUT_Y; - output_components = 1; - } else if (mode_ == "rgb") { - output_format = NVJPEG_OUTPUT_RGB; - output_components = 3; - } else { - PADDLE_THROW(platform::errors::Fatal( - "The provided mode is not supported for JPEG files on GPU")); - } - - PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecodeParamsSetOutputFormat(decode_params_, output_format)); - - std::vector out_shape = {output_components, height, width}; - out->Resize(framework::make_ddim(out_shape)); - - // allocate memory and assign to out_image - auto* data = out->mutable_data(ctx.GetPlace()); - for (int c = 0; c < output_components; c++) { - out_image->channel[c] = data + c * width * height; - out_image->pitch[c] = width; - } - } - - void Decode(const uint8_t* bit_stream, size_t bit_len, nvjpegImage_t* out_image) { - auto buffer = pinned_buffers_[page_id_]; - auto stream = nvjpeg_streams_[page_id_]; - page_id_ ^= 1; - - // decode jpeg in host to pinned buffer - PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegStateAttachPinnedBuffer(state_, buffer)); - PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegJpegStreamParse(handle_, bit_stream, bit_len, false, false, stream)); - PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecodeJpegHost(handle_, decoder_, state_, decode_params_, stream)); - - // transfer and decode to device buffer - PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegStateAttachDeviceBuffer(state_, device_buffer_)); - PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecodeJpegTransferToDevice(handle_, decoder_, state_, stream, cuda_stream_)); - PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecodeJpegDevice(handle_, decoder_, state_, out_image, cuda_stream_)); - - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(cuda_stream_)); - } - - void Run(const uint8_t* bit_stream, size_t bit_len, LoDTensor* out, 
framework::ExecutionContext& ctx) { - nvjpegImage_t image; - ParseOutputInfo(bit_stream, bit_len, out, &image, ctx); - Decode(bit_stream, bit_len, &image); - } - - private: - DISABLE_COPY_AND_ASSIGN(NvjpegDecoder); - - cudaStream_t cuda_stream_ = nullptr; - std::vector nvjpeg_streams_; - - nvjpegHandle_t handle_ = nullptr; - nvjpegJpegState_t state_ = nullptr; - nvjpegJpegDecoder_t decoder_ = nullptr; - nvjpegDecodeParams_t decode_params_ = nullptr; - - nvjpegPinnedAllocator_t pinned_allocator_ = {&host_malloc, &host_free}; - nvjpegDevAllocator_t device_allocator_ = {&dev_malloc, &dev_free}; - std::vector pinned_buffers_; - nvjpegBufferDevice_t device_buffer_ = nullptr; - - int page_id_; - - const std::string mode_; -}; - - -// class NvjpegDecoderWorkerPool { -// public: -// NvjpegDecoderWorkerPool(const int num_threads, ) -// -// private: -// DISABLE_COPY_AND_ASSIGN(NvjpegDecoderWorkerPool); -// -// struct NvjpegDecoderTask { -// const uint8_t* bit_stream; -// const size_t bit_len; -// LoDTensor* out; -// } -// -// class NvjpegDecoderWorker { -// public: -// NvjpegDecoderWorker( -// const std::string mode, framework::ExecutionContext ctx, -// const int capacity) -// : mode_(mode), -// ctx_(ctx), -// capacity_(capacity), -// pool_(1) { -// -// } -// -// private: -// const std::string mode_; -// const framework::ExecutionContext ctx_; -// -// BlockingQueue -// ThreadPool pool_; -// } -// -// } - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/nvjpeg_decoder_thread_pool.h b/paddle/fluid/operators/nvjpeg_decoder_thread_pool.h deleted file mode 100644 index 99b18e810f5a73..00000000000000 --- a/paddle/fluid/operators/nvjpeg_decoder_thread_pool.h +++ /dev/null @@ -1,141 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include - -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/operators/nvjpeg_decoder.h" - -namespace paddle { -namespace operators { - -using LoDTensor = framework::LoDTensor; - -struct NvjpegDecodeWork { - const uint8_t* bit_stream; - size_t bit_len; - LoDTensor* tensor; - framework::ExecutionContext ctx; -}; - -class NvjpegDecoderThreadPool { - public: - NvjpegDecoderThreadPool(const int num_threads, const std::string mode) - : threads_(num_threads), - mode_(mode), - shutdown_(false), - running_(false), - completed_(false), - outstand_works_(0) { - PADDLE_ENFORCE_GT(num_threads, 0, platform::errors::InvalidArgument( - "num_threads shoule be a positive interger, " - "but got %d", num_threads)); - for (int i = 0; i < num_threads; i++) { - threads_.emplace_back( - std::thread(std::bind(&NvjpegDecoderThreadPool::ThreadLoop, this, i))); - } - } - - void AddWork(std::shared_ptr work) { - work_queue_.push_back(work); - } - - void RunAll(const bool wait, const bool sort = true) { - // Sort images in length desending order - if (sort) SortWorkByLengthDescend(); - - { - std::lock_guard lock(mutex_); - completed_ = false; - running_ = true; - } - running_cond_.notify_all(); - - if (wait) WaitTillWorksCompleted(); - } - - void WaitTillWorksCompleted() { - std::unique_lock lock(mutex_); - completed_cond_.wait(lock, [this] { return this->completed_; }); - running_ = false; - } - - void Shutdown() { - std::lock_guard lock(mutex_); - - running_ = false; - shutdown_.store(true); - running_cond_.notify_all(); - - work_queue_.clear(); - - for (auto 
&thread : threads_) { - thread.join(); - } - } - - private: - std::vector threads_; - std::string mode_; - - std::deque> work_queue_; - std::mutex mutex_; - - std::atomic shutdown_; - std::condition_variable running_cond_; - bool running_; - std::condition_variable completed_cond_; - bool completed_; - - int outstand_works_; - - void SortWorkByLengthDescend() { - std::lock_guard lock(mutex_); - std::sort(work_queue_.begin(), work_queue_.end(), - [](const std::shared_ptr a, - const std::shared_ptr b) { - return b->bit_len < a->bit_len; - }); - } - - void ThreadLoop(const int thread_idx) { - NvjpegDecoder* decoder = new NvjpegDecoder(mode_); - - while (!shutdown_.load()) { - std::unique_lock lock(mutex_); - running_cond_.wait(lock, [this] { return running_ && !work_queue_.empty(); }); - if (shutdown_.load()) break; - - auto work = work_queue_.front(); - work_queue_.pop_front(); - outstand_works_++; - lock.unlock(); - - decoder->Run(work->bit_stream, work->bit_len, work->tensor, work->ctx); - - lock.lock(); - outstand_works_--; - if (outstand_works_ == 0 && work_queue_.empty()) { - completed_ = true; - lock.unlock(); - completed_cond_.notify_one(); - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index 6d17a83a2af791..3c03418a0539ca 100644 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -957,14 +957,14 @@ def image_decode(x, mode='unchanged', num_threads=2, name=None): inputs = {'X': x} attrs = {"mode": mode, "num_threads": num_threads} - helper = LayerHelper("decode", **locals()) + helper = LayerHelper("batch_decode", **locals()) out = helper.create_variable( name=unique_name.generate("image_decode"), type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, dtype=x.dtype) # out = helper.create_variable_for_type_inference('uint8') helper.append_op( - type="decode", inputs=inputs, attrs=attrs, outputs={"Out": out}) + type="batch_decode", inputs=inputs, attrs=attrs, 
outputs={"Out": out}) return out From 7ab1889e2d12abcd9c97f3e3453a1e0f9b5a019d Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 30 Nov 2021 08:59:01 +0000 Subject: [PATCH 20/95] why 2nd op not run --- paddle/fluid/framework/details/CMakeLists.txt | 2 +- .../fluid/framework/details/build_strategy.cc | 3 + .../fast_threaded_ssa_graph_executor.cc | 2 + .../fluid/framework/details/op_handle_base.cc | 1 + paddle/fluid/framework/ir/CMakeLists.txt | 1 + .../fluid/framework/ir/data_io_queue_pass.cc | 68 +++++++++++++++++++ paddle/fluid/framework/variable_helper.cc | 3 + .../fluid/operators/data/batch_decode_op.cu | 37 ++++++++-- .../fluid/operators/file_label_reader_op.cc | 33 +++++++-- 9 files changed, 138 insertions(+), 12 deletions(-) create mode 100644 paddle/fluid/framework/ir/data_io_queue_pass.cc diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 87f77ec2fff3a6..149b167f7ff7b3 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -139,7 +139,7 @@ set(IR_PASS_DEPS graph_viz_pass multi_devices_graph_pass coalesce_grad_tensor_pass fuse_all_reduce_op_pass backward_optimizer_op_deps_pass fuse_adam_op_pass fuse_sgd_op_pass fuse_momentum_op_pass sync_batch_norm_pass runtime_context_cache_pass graph_to_program_pass - fix_op_run_order_pass) + fix_op_run_order_pass data_io_queue_pass) if (WITH_CINN) set(IR_PASS_DEPS ${IR_PASS_DEPS} build_cinn_pass) diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index cee97820d6a033..791726433d9ff5 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -83,6 +83,8 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { // Note: This pass is used to check whether the multi_device_graph is right. 
AppendPass("multi_devices_check_pass"); + AppendPass("data_io_queue_pass"); + SetCollectiveContext(); } @@ -493,6 +495,7 @@ USE_PASS(fuse_momentum_op_pass); USE_PASS(fuse_all_reduce_op_pass); USE_PASS(runtime_context_cache_pass); USE_PASS(add_reader_dependency_pass); +USE_PASS(data_io_queue_pass); #ifdef PADDLE_WITH_CINN USE_PASS(build_cinn_pass); #endif diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc index 75998e4582e2bc..9c9e7e21361d1e 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc @@ -262,8 +262,10 @@ void FastThreadedSSAGraphExecutor::RunOpAsync( auto &outputs = op_to_run->Outputs(); op_to_run = nullptr; for (auto &output : outputs) { + LOG(ERROR) << "op output " << output->Name(); for (auto &pending_op : output->PendingOps()) { std::atomic &deps = op_deps->at(pending_op); + LOG(ERROR) << "pending_op: " << pending_op->Name() << ", " << deps.load(); if (deps.fetch_sub(1) != 1) continue; // NOTE(zjl): op with highest priority should run diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index 4b5d0563d73946..6690fabcc68ec7 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -349,6 +349,7 @@ void OpHandleBase::RunAndRecordEvent(platform::Place p, size_t OpHandleBase::NotReadyInputSize() const { std::unordered_set res; for (auto *var : inputs_) { + LOG(ERROR) << "NotReadyInputSize: op " << Name() << ", var " << var->Name() << ", GeneratedOp " << var->GeneratedOp(); if (var->GeneratedOp() != nullptr) { res.emplace(var); } diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 80ae0f04daa4a0..46b617719bee18 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ 
b/paddle/fluid/framework/ir/CMakeLists.txt @@ -98,6 +98,7 @@ pass_library(unsqueeze2_eltwise_fuse_pass inference) pass_library(layer_norm_fuse_pass inference) pass_library(add_support_int8_pass inference) pass_library(generate_pass DEPS pass_desc_proto) +pass_library(data_io_queue_pass base) target_link_libraries(generate_pass pass_desc_proto) if(WITH_GPU OR WITH_ROCM) pass_library(cudnn_placement_pass base DEPS placement_pass_base) diff --git a/paddle/fluid/framework/ir/data_io_queue_pass.cc b/paddle/fluid/framework/ir/data_io_queue_pass.cc new file mode 100644 index 00000000000000..c38edb39994edb --- /dev/null +++ b/paddle/fluid/framework/ir/data_io_queue_pass.cc @@ -0,0 +1,68 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include + +#include "glog/logging.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +class Graph; + +static int MAX_VARS_LEN = 100; + +class DataIOQueuePass: public Pass { + protected: + void ApplyImpl(ir::Graph *graph) const override { + // VLOG(3) << "Change inputs/outputs of data ops to queue"; + LOG(ERROR) << "Change inputs/outputs of data ops to queue"; + std::vector var_names; + var_names.reserve(MAX_VARS_LEN); + for (const Node *n : graph->Nodes()) { + if (n->IsOp() && n->Op()) { + auto *op = n->Op(); + if (op->Type() == "file_label_reader" + || op->Type() == "batch_decode" + || op->Type() == "map") { + auto& outputs = op->Outputs(); + for (auto iter = outputs.begin(); iter != outputs.end(); iter++) { + auto vars = iter->second; + std::copy(vars.begin(), vars.end(), std::back_inserter(var_names)); + } + } + } + } + + for (const Node *n : graph->Nodes()) { + if (n->IsVar() && n->Var()) { + auto *var = n->Var(); + auto iter = std::find(var_names.begin(), var_names.end(), var->Name()); + if (iter != var_names.end()) { + var->SetType(framework::proto::VarType::LOD_TENSOR_BLOCKING_QUEUE); + var->SetPersistable(true); + } + } + } + } +}; + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(data_io_queue_pass, paddle::framework::ir::DataIOQueuePass); diff --git a/paddle/fluid/framework/variable_helper.cc b/paddle/fluid/framework/variable_helper.cc index 37ec5d7bc83bda..cbc1e86d54f996 100644 --- a/paddle/fluid/framework/variable_helper.cc +++ b/paddle/fluid/framework/variable_helper.cc @@ -22,6 +22,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/string_array.h" +#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" #include "paddle/fluid/platform/place.h" namespace paddle { @@ -42,6 +43,8 @@ void InitializeVariable(Variable *var, proto::VarType::Type var_type) { var->GetMutable(); } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) { var->GetMutable(); + } else if (var_type == proto::VarType::LOD_TENSOR_BLOCKING_QUEUE) { + var->GetMutable(); } else if (var_type == proto::VarType::STRINGS) { var->GetMutable(); } else if (var_type == proto::VarType::VOCAB) { diff --git a/paddle/fluid/operators/data/batch_decode_op.cu b/paddle/fluid/operators/data/batch_decode_op.cu index 8a08ba584dfead..9ccc09f675d3a3 100644 --- a/paddle/fluid/operators/data/batch_decode_op.cu +++ b/paddle/fluid/operators/data/batch_decode_op.cu @@ -19,11 +19,14 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/operators/data/nvjpeg_decoder.h" +#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" namespace paddle { namespace operators { namespace data { +using LoDTensorBlockingQueueHolder = operators::reader::LoDTensorBlockingQueueHolder; + static NvjpegDecoderThreadPool* decode_pool = nullptr; template @@ -39,15 +42,34 @@ class GPUBatchDecodeJpegKernel : public framework::OpKernel { decode_pool = new NvjpegDecoderThreadPool(num_threads, mode); } - const framework::LoDTensorArray* inputs = - ctx.Input("X"); + // const framework::LoDTensorArray* inputs = + // ctx.Input("X"); + // + // auto* out = ctx.OutputVar("Out"); + // auto& out_array = *out->GetMutable(); + // out_array.resize(inputs->size()); + + auto* in_var = ctx.InputVar("X"); + auto in_queue = in_var->Get().GetQueue(); + + auto* out_var = ctx.OutputVar("Out"); + auto holder = out_var->Get(); + auto out_queue = holder.GetQueue(); + if (out_queue == nullptr) 
{ + holder.InitOnce(2); + out_queue = holder.GetQueue(); + } + + bool success = true; + auto inputs = in_queue->Pop(&success); + PADDLE_ENFORCE_EQ(success, true, + platform::errors::PreconditionNotMet("Read from input queue failed")); - auto* out = ctx.OutputVar("Out"); - auto& out_array = *out->GetMutable(); - out_array.resize(inputs->size()); + framework::LoDTensorArray out_array; + out_array.reserve(inputs.size()); - for (size_t i = 0; i < inputs->size(); i++) { - const framework::LoDTensor x = inputs->at(i); + for (size_t i = 0; i < inputs.size(); i++) { + const framework::LoDTensor x = inputs.at(i); auto* x_data = x.data(); size_t x_numel = static_cast(x.numel()); @@ -61,6 +83,7 @@ class GPUBatchDecodeJpegKernel : public framework::OpKernel { } decode_pool->RunAll(true); + out_queue->Push(out_array); // // multi-phrase decode single thread // if (!nvjpeg_decoder) { diff --git a/paddle/fluid/operators/file_label_reader_op.cc b/paddle/fluid/operators/file_label_reader_op.cc index 9d90717d0624e5..95b0af35f4f208 100644 --- a/paddle/fluid/operators/file_label_reader_op.cc +++ b/paddle/fluid/operators/file_label_reader_op.cc @@ -21,11 +21,13 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" namespace paddle { namespace operators { using LoDTensorArray = framework::LoDTensorArray; +using LoDTensorBlockingQueueHolder = operators::reader::LoDTensorBlockingQueueHolder; enum BufferStatus { kBufferStatusSuccess = 0, @@ -194,6 +196,22 @@ class FileDataReaderWrapper { FileDataReaderWrapper reader_wrapper; +static void CheckAndInitQueue(framework::Variable* var, int capacity) { + if (var->IsInitialized()) { + PADDLE_ENFORCE_EQ(var->IsType(), true, + platform::errors::InvalidArgument( + "Variable should hold LoDTensorBlockingQueueHolder type")); + auto holder = var->Get(); + if (holder.GetQueue() == nullptr) { + 
holder.InitOnce(capacity); + } + } else { + LOG(ERROR) << "Initialize Output LoDTensorBlockingQueue capacity " << capacity; + auto* holder = var->GetMutable(); + holder->InitOnce(capacity); + } +} + template class CPUFileLabelKernel : public framework::OpKernel { public: @@ -236,11 +254,18 @@ class FileLabelReaderOp : public framework::OperatorBase { } LoDTensorArray samples = reader_wrapper.reader->Next(); auto* out = scope.FindVar(Output("Out")); - auto& out_array = *out->GetMutable(); - out_array.resize(samples.size()); - for (size_t i = 0; i < samples.size(); ++i) { - copy_tensor(samples[i], &out_array[i]); + auto holder = out->Get(); + auto out_queue = holder.GetQueue(); + if (out_queue == nullptr) { + holder.InitOnce(2); + out_queue = holder.GetQueue(); } + // framework::LoDTensorArray out_array; + // out_array.resize(samples.size()); + // for (size_t i = 0; i < samples.size(); ++i) { + // copy_tensor(samples[i], &out_array[i]); + // } + out_queue->Push(samples); LOG(ERROR) << "FileLabelReaderOp RunImpl finish"; } From 256d7b8d037bcd28eccdc64c1d84020b19fd4e7f Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Wed, 1 Dec 2021 13:53:43 +0000 Subject: [PATCH 21/95] run success --- .../fast_threaded_ssa_graph_executor.cc | 2 - .../fluid/framework/details/op_handle_base.cc | 1 - paddle/fluid/framework/executor_gc_helper.cc | 3 +- .../fluid/framework/ir/data_io_queue_pass.cc | 1 + .../fluid/operators/data/batch_decode_op.cc | 7 ++- .../fluid/operators/data/batch_decode_op.cu | 13 ++--- paddle/fluid/operators/data/pipeline.cc | 10 +++- paddle/fluid/operators/data/pipeline.h | 1 + .../fluid/operators/file_label_reader_op.cc | 23 +++++---- .../operators/random_crop_and_resize_op.cc | 4 +- .../operators/random_crop_and_resize_op.cu | 51 ++++++++++++++----- 11 files changed, 80 insertions(+), 36 deletions(-) diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc index 
9c9e7e21361d1e..75998e4582e2bc 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc @@ -262,10 +262,8 @@ void FastThreadedSSAGraphExecutor::RunOpAsync( auto &outputs = op_to_run->Outputs(); op_to_run = nullptr; for (auto &output : outputs) { - LOG(ERROR) << "op output " << output->Name(); for (auto &pending_op : output->PendingOps()) { std::atomic &deps = op_deps->at(pending_op); - LOG(ERROR) << "pending_op: " << pending_op->Name() << ", " << deps.load(); if (deps.fetch_sub(1) != 1) continue; // NOTE(zjl): op with highest priority should run diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index 6690fabcc68ec7..4b5d0563d73946 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -349,7 +349,6 @@ void OpHandleBase::RunAndRecordEvent(platform::Place p, size_t OpHandleBase::NotReadyInputSize() const { std::unordered_set res; for (auto *var : inputs_) { - LOG(ERROR) << "NotReadyInputSize: op " << Name() << ", var " << var->Name() << ", GeneratedOp " << var->GeneratedOp(); if (var->GeneratedOp() != nullptr) { res.emplace(var); } diff --git a/paddle/fluid/framework/executor_gc_helper.cc b/paddle/fluid/framework/executor_gc_helper.cc index 8c64d65ff4be66..1fbe8b4d468ef7 100644 --- a/paddle/fluid/framework/executor_gc_helper.cc +++ b/paddle/fluid/framework/executor_gc_helper.cc @@ -46,7 +46,8 @@ static bool VarCanBeDeleted(const std::string &name, const BlockDesc &block, return type == proto::VarType::LOD_TENSOR || type == proto::VarType::SELECTED_ROWS || - type == proto::VarType::LOD_TENSOR_ARRAY; + type == proto::VarType::LOD_TENSOR_ARRAY || + type == proto::VarType::LOD_TENSOR_BLOCKING_QUEUE; } std::unordered_map> diff --git a/paddle/fluid/framework/ir/data_io_queue_pass.cc b/paddle/fluid/framework/ir/data_io_queue_pass.cc index 
c38edb39994edb..8cfa5452647181 100644 --- a/paddle/fluid/framework/ir/data_io_queue_pass.cc +++ b/paddle/fluid/framework/ir/data_io_queue_pass.cc @@ -38,6 +38,7 @@ class DataIOQueuePass: public Pass { auto *op = n->Op(); if (op->Type() == "file_label_reader" || op->Type() == "batch_decode" + || op->Type() == "random_crop_and_resize" || op->Type() == "map") { auto& outputs = op->Outputs(); for (auto iter = outputs.begin(); iter != outputs.end(); iter++) { diff --git a/paddle/fluid/operators/data/batch_decode_op.cc b/paddle/fluid/operators/data/batch_decode_op.cc index c7fb58c3c2027e..6c78118a8e0faa 100644 --- a/paddle/fluid/operators/data/batch_decode_op.cc +++ b/paddle/fluid/operators/data/batch_decode_op.cc @@ -17,6 +17,7 @@ #include #include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/enforce.h" @@ -30,6 +31,7 @@ class CPUBatchDecodeJpegKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { // TODO(LieLinJiang): add cpu implement. 
+ LOG(ERROR) << "CPUBatchDecodeJpegKernel enter"; PADDLE_THROW(platform::errors::Unimplemented( "DecodeJpeg op only supports GPU now.")); } @@ -63,13 +65,16 @@ class BatchDecodeJpegOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { + // return framework::OpKernelType( + // OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); return framework::OpKernelType( - OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + framework::proto::VarType::UINT8, ctx.GetPlace()); } framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, const framework::Tensor& tensor, const framework::OpKernelType& expected_kernel_type) const { + LOG(ERROR) << "GetKernelTypeForVar enter "; if (var_name == "X") { return expected_kernel_type; } diff --git a/paddle/fluid/operators/data/batch_decode_op.cu b/paddle/fluid/operators/data/batch_decode_op.cu index 9ccc09f675d3a3..6e22423c93589f 100644 --- a/paddle/fluid/operators/data/batch_decode_op.cu +++ b/paddle/fluid/operators/data/batch_decode_op.cu @@ -39,6 +39,7 @@ class GPUBatchDecodeJpegKernel : public framework::OpKernel { // multi-phrase decode thread pool if (!decode_pool) { + LOG(ERROR) << "decode_pool init"; decode_pool = new NvjpegDecoderThreadPool(num_threads, mode); } @@ -53,20 +54,20 @@ class GPUBatchDecodeJpegKernel : public framework::OpKernel { auto in_queue = in_var->Get().GetQueue(); auto* out_var = ctx.OutputVar("Out"); - auto holder = out_var->Get(); - auto out_queue = holder.GetQueue(); + auto out_queue = out_var->Get().GetQueue(); if (out_queue == nullptr) { - holder.InitOnce(2); - out_queue = holder.GetQueue(); + LOG(ERROR) << "decode init output queue"; + auto* holder = out_var->template GetMutable(); + holder->InitOnce(2); + out_queue = holder->GetQueue(); } bool success = true; auto inputs = in_queue->Pop(&success); PADDLE_ENFORCE_EQ(success, true, 
platform::errors::PreconditionNotMet("Read from input queue failed")); - framework::LoDTensorArray out_array; - out_array.reserve(inputs.size()); + out_array.resize(inputs.size()); for (size_t i = 0; i < inputs.size(); i++) { const framework::LoDTensor x = inputs.at(i); diff --git a/paddle/fluid/operators/data/pipeline.cc b/paddle/fluid/operators/data/pipeline.cc index bc67fc3e01c3e7..82fa06df714531 100644 --- a/paddle/fluid/operators/data/pipeline.cc +++ b/paddle/fluid/operators/data/pipeline.cc @@ -82,8 +82,14 @@ void Pipeline::StartPrefetchThread(std::shared_ptr executor, "The output variable %s is not found in DataLoader " "program's internal scope", output_var_names_[i])); - CheckOutputVarStatus(*out_var, output_var_names_[i]); - copy_tensor(out_var->Get(), &t_arr[i]); + // CheckOutputVarStatus(*out_var, output_var_names_[i]); + // copy_tensor(out_var->Get(), &t_arr[i]); + auto out_queue = out_var->Get().GetQueue(); + bool success = true; + auto outputs = out_queue->Pop(&success); + PADDLE_ENFORCE_EQ(success, true, + platform::errors::PreconditionNotMet("Read from input queue failed")); + copy_tensor(outputs.at(0), &t_arr[i]); } // TODO: dataset drain check diff --git a/paddle/fluid/operators/data/pipeline.h b/paddle/fluid/operators/data/pipeline.h index 2f0f10abe65579..bc2776f5675a14 100644 --- a/paddle/fluid/operators/data/pipeline.h +++ b/paddle/fluid/operators/data/pipeline.h @@ -30,6 +30,7 @@ using ParallelExecutor = framework::ParallelExecutor; using Variable = framework::Variable; using LoDTensor = framework::LoDTensor; using LoDTensorBlockingQueue = operators::reader::LoDTensorBlockingQueue; +using LoDTensorBlockingQueueHolder = operators::reader::LoDTensorBlockingQueueHolder; namespace data { diff --git a/paddle/fluid/operators/file_label_reader_op.cc b/paddle/fluid/operators/file_label_reader_op.cc index 95b0af35f4f208..459d363a59b7dc 100644 --- a/paddle/fluid/operators/file_label_reader_op.cc +++ b/paddle/fluid/operators/file_label_reader_op.cc @@ 
-254,18 +254,21 @@ class FileLabelReaderOp : public framework::OperatorBase { } LoDTensorArray samples = reader_wrapper.reader->Next(); auto* out = scope.FindVar(Output("Out")); - auto holder = out->Get(); - auto out_queue = holder.GetQueue(); + // auto* holder = out->template GetMutable(); + auto out_queue = out->Get().GetQueue(); if (out_queue == nullptr) { - holder.InitOnce(2); - out_queue = holder.GetQueue(); + LOG(ERROR) << "init output queue"; + auto* holder = out->template GetMutable(); + holder->InitOnce(2); + out_queue = holder->GetQueue(); } - // framework::LoDTensorArray out_array; - // out_array.resize(samples.size()); - // for (size_t i = 0; i < samples.size(); ++i) { - // copy_tensor(samples[i], &out_array[i]); - // } - out_queue->Push(samples); + + framework::LoDTensorArray out_array; + out_array.resize(samples.size()); + for (size_t i = 0; i < samples.size(); ++i) { + copy_tensor(samples[i], &out_array[i]); + } + out_queue->Push(out_array); LOG(ERROR) << "FileLabelReaderOp RunImpl finish"; } diff --git a/paddle/fluid/operators/random_crop_and_resize_op.cc b/paddle/fluid/operators/random_crop_and_resize_op.cc index 1bd8d481808b2f..a0d7e79973a59a 100644 --- a/paddle/fluid/operators/random_crop_and_resize_op.cc +++ b/paddle/fluid/operators/random_crop_and_resize_op.cc @@ -48,8 +48,10 @@ class RandomCropAndResizeOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { + // return framework::OpKernelType( + // OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); return framework::OpKernelType( - OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + framework::proto::VarType::UINT8, ctx.GetPlace()); } framework::OpKernelType GetKernelTypeForVar( diff --git a/paddle/fluid/operators/random_crop_and_resize_op.cu b/paddle/fluid/operators/random_crop_and_resize_op.cu index 1aaf347af7b9ed..684de9c9ffe9e8 100644 --- 
a/paddle/fluid/operators/random_crop_and_resize_op.cu +++ b/paddle/fluid/operators/random_crop_and_resize_op.cu @@ -15,12 +15,14 @@ #include "paddle/fluid/operators/random_crop_and_resize_op.h" #include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" namespace paddle { namespace operators { using framework::LoDTensor; using DataLayout = framework::DataLayout; +using LoDTensorBlockingQueueHolder = operators::reader::LoDTensorBlockingQueueHolder; template __global__ void KeNearestNeighborInterpFw( @@ -278,11 +280,31 @@ class RandomCropAndResizeCUDAKernel : public framework::OpKernel { platform::errors::NotFound("This kernel only runs on GPU device.")); // get input, output // auto& x = ctx.MultiInput("X"); - auto* x = ctx.Input("X"); - PADDLE_ENFORCE_GT(x->size(), 0, - platform::errors::InvalidArgument( - "The size of X must be greater than 0.")); - auto* out = ctx.Output("Out"); + // auto* x = ctx.Input("X"); + // PADDLE_ENFORCE_GT(x->size(), 0, + // platform::errors::InvalidArgument( + // "The size of X must be greater than 0.")); + // auto* out = ctx.Output("Out"); + + auto* in_var = ctx.InputVar("X"); + auto in_queue = in_var->Get().GetQueue(); + LOG(ERROR) << "crop resize in_var: " << in_var << "in_queue: " << in_queue; + + auto* out_var = ctx.OutputVar("Out"); + auto out_queue = out_var->Get().GetQueue(); + if (out_queue == nullptr) { + LOG(ERROR) << "crop resize init output queue"; + auto* holder = out_var->template GetMutable(); + holder->InitOnce(2); + out_queue = holder->GetQueue(); + } + + bool success = false; + auto x = in_queue->Pop(&success); + PADDLE_ENFORCE_EQ(success, true, + platform::errors::PreconditionNotMet("Read from input queue failed")); + framework::LoDTensor out; + // get size, scale, ratio auto size = ctx.Attr>("size"); auto scale = ctx.Attr>("scale"); @@ -298,18 +320,18 @@ class RandomCropAndResizeCUDAKernel : public 
framework::OpKernel { bool align_corners = ctx.Attr("align_corners"); int align_mode = ctx.Attr("align_mode"); - auto* img = &x->at(0); + auto* img = &x.at(0); int64_t img_c = data_layout == DataLayout::kNCHW ? \ img->dims()[0] : img->dims()[2]; - std::vector out_dim = {static_cast(x->size()), + std::vector out_dim = {static_cast(x.size()), img_c, size[0], size[1]}; - out->Resize(framework::make_ddim(out_dim)); - out->mutable_data(ctx.GetPlace()); + out.Resize(framework::make_ddim(out_dim)); + out.mutable_data(ctx.GetPlace()); int img_h, img_w, idx_h, idx_w, crop_h, crop_w; - for (int i = 0; i < x->size(); i++) { - img = &x->at(i); + for (int i = 0; i < x.size(); i++) { + img = &x.at(i); img_h = data_layout == DataLayout::kNCHW ? img->dims()[1] : img->dims()[0]; img_w = @@ -317,11 +339,16 @@ class RandomCropAndResizeCUDAKernel : public framework::OpKernel { GetCropParameters(img_h, img_w, scale, ratio, &idx_h, &idx_w, &crop_h, &crop_w, seed); - auto out_tensor = out->Slice(i, i + 1); + auto out_tensor = out.Slice(i, i + 1); RandomCropAndResizeFwd(ctx, *img, &out_tensor, size, interp_method, align_corners, align_mode, img_h, img_w, img_c, idx_h, idx_w, crop_h, crop_w, data_layout); } + + framework::LoDTensorArray out_array; + out_array.reserve(1); + out_array.emplace_back(out); + out_queue->Push(out_array); LOG(ERROR) << "RandomCropAndResizeCUDAKernel Compute finish"; } }; From 41cebd6aa3a8463dd702324d4c67c6d1178767d1 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Mon, 6 Dec 2021 12:21:42 +0000 Subject: [PATCH 22/95] map success --- paddle/fluid/operators/data/map_op.cc | 78 ++++++++-- paddle/fluid/operators/data/map_op.h | 53 ++++--- paddle/fluid/operators/data/map_runner.cc | 142 ++++++++++++++---- paddle/fluid/operators/data/map_runner.h | 29 ++-- paddle/fluid/operators/data/pipeline.cc | 2 + .../operators/random_crop_and_resize_op.cu | 1 + python/paddle/fluid/dataloader/__init__.py | 6 +- python/paddle/fluid/dataloader/ops.py | 66 +++++--- 
python/paddle/fluid/dataloader/pipeline.py | 2 + python/paddle/io/__init__.py | 4 +- 10 files changed, 284 insertions(+), 99 deletions(-) diff --git a/paddle/fluid/operators/data/map_op.cc b/paddle/fluid/operators/data/map_op.cc index 3339c00dbe1031..1a5c94b3379542 100644 --- a/paddle/fluid/operators/data/map_op.cc +++ b/paddle/fluid/operators/data/map_op.cc @@ -18,29 +18,84 @@ namespace operators { using framework::Tensor; -class MapOp : public framework::OperatorWithKernel { +class MapOp : public framework::OperatorBase { public: - using framework::OperatorWithKernel::OperatorWithKernel; + // using framework::OperatorWithKernel::OperatorWithKernel; + MapOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasOutputs("X"), "Input", "X", "MapOp"); + void InferShape(framework::InferShapeContext* ctx) const { + OP_INOUT_CHECK(ctx->HasInputs("In"), "Input", "In", "MapOp"); OP_INOUT_CHECK(ctx->HasOutputs("Out"), "Output", "Out", "MapOp"); } protected: framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { + const framework::ExecutionContext& ctx) const { return framework::OpKernelType(framework::proto::VarType::FP32, ctx.GetPlace()); } - framework::OpKernelType GetKernelTypeForVar( - const std::string& var_name, const framework::Tensor& tensor, - const framework::OpKernelType& expected_kernel_type) const override { - return expected_kernel_type; + // framework::OpKernelType GetKernelTypeForVar( + // const std::string& var_name, const framework::Tensor& tensor, + // const framework::OpKernelType& expected_kernel_type) const override { + // return expected_kernel_type; + // } + + private: + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const 
override { + LOG(ERROR) << "MapOpKernel RunImpl enter"; + // Step1: get output vars and attrs + auto input_var = scope.FindVar(Input("In")); + auto output_var = scope.FindVar(Output("Out")); + std::vector input_vars; + input_vars.reserve(1); + input_vars.emplace_back(input_var); + std::vector output_vars; + output_vars.reserve(1); + output_vars.emplace_back(output_var); + + CheckInputQueueStatus(input_vars); + CheckAndInitOutputQueue(output_vars, /*capacity=*/2); + + auto input_var_names = Attr>("input_var_names"); + auto output_var_names = Attr>("output_var_names"); + auto* map_block = Attr("map_block"); + auto start_op_index = Attr("start_op_index"); + auto end_op_index = Attr("end_op_index"); + auto program_id = Attr("program_id"); + LOG(ERROR) << "MapOpKernel block id: " << map_block->ID(); + for (auto var_name: map_block->LocalVarNames()) { + LOG(ERROR) << "MapOpKernel map_block vars: " << var_name; + } + + auto input_queues = GetQueueVecFromVariableVec(input_vars); + auto output_queues = GetQueueVecFromVariableVec(output_vars); + data::MapRunnerManager::Instance()->StartMapRunner( + program_id, map_block, dev_place,start_op_index, end_op_index, + input_var_names, output_var_names, input_queues, output_queues, &scope); + LOG(ERROR) << "MapOpKernel RunImpl finish"; } }; +class MapInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInputs("In"), "Input", "In", "MapOp"); + OP_INOUT_CHECK(ctx->HasOutputs("Out"), "Output", "Out", "MapOp"); + } +}; + +class MapInferVarType : public framework::VarTypeInference { + public: + void operator()(framework::InferVarTypeContext* ctx) const override {} +}; + + class MapOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -52,7 +107,7 @@ class MapOpMaker : public framework::OpProtoAndCheckerMaker { "(LoDTensorBlockingQueueHolder)" "The output tensors of Map operator") .AsDuplicable(); - 
AddAttr("global_block", + AddAttr("map_block", "(BlockDesc *)" "The global block of executed map program " "desc."); @@ -82,5 +137,6 @@ class MapOpMaker : public framework::OpProtoAndCheckerMaker { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(map, ops::MapOp, ops::MapOpMaker); +REGISTER_OPERATOR(map, ops::MapOp, ops::MapOpMaker, + ops::MapInferShape, ops::MapInferVarType); REGISTER_OP_CPU_KERNEL(map, ops::MapOpKernel); diff --git a/paddle/fluid/operators/data/map_op.h b/paddle/fluid/operators/data/map_op.h index fbd4865e57a916..f254188ab0fd40 100644 --- a/paddle/fluid/operators/data/map_op.h +++ b/paddle/fluid/operators/data/map_op.h @@ -29,6 +29,7 @@ static void CheckInputQueueStatus(const std::vector& vars) { "Input Variables of MapOp should hold " "LoDTensorBlockingQueueHolder type")); auto queue = var->Get().GetQueue(); + LOG(ERROR) << "CheckAndInitOutputQueue get queue: " << queue; PADDLE_ENFORCE_NE(queue, nullptr, platform::errors::InvalidArgument( "Input LoDTensorBlockingQueue is not initialized")); @@ -38,14 +39,16 @@ static void CheckInputQueueStatus(const std::vector& vars) { static void CheckAndInitOutputQueue(const std::vector& vars, int capacity) { for (auto var : vars) { if (var->IsInitialized()) { + LOG(ERROR) << "CheckAndInitOutputQueue is LoDTensorBlockingQueueHolder: " << var->IsType(); PADDLE_ENFORCE_EQ(var->IsType(), true, platform::errors::InvalidArgument( "Output Variables of MapOp should hold " "LoDTensorBlockingQueueHolder type")); auto queue = var->Get().GetQueue(); - PADDLE_ENFORCE_NE(queue, nullptr, - platform::errors::InvalidArgument( - "Input LoDTensorBlockingQueue is not initialized")); + if (queue == nullptr) { + auto* holder = var->template GetMutable(); + holder->InitOnce(2); + } } else { // VLOG(1) << "Initialize Output LoDTensorBlockingQueue capacity " << capacity; LOG(ERROR) << "Initialize Output LoDTensorBlockingQueue capacity " << capacity; @@ -68,25 +71,31 @@ template class MapOpKernel : public 
framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - // Step1: get output vars and attrs - auto input_vars = ctx.MultiInputVar("X"); - auto output_vars = ctx.MultiOutputVar("Out"); - - CheckInputQueueStatus(input_vars); - CheckAndInitOutputQueue(output_vars, /*capacity=*/2); - - auto input_var_names = ctx.Attr>("input_var_names"); - auto output_var_names = ctx.Attr>("output__var_names"); - auto* global_block = ctx.Attr("global_block"); - auto start_op_index = ctx.Attr("start_op_index"); - auto end_op_index = ctx.Attr("end_op_index"); - auto program_id = ctx.Attr("program_id"); - - auto input_queues = GetQueueVecFromVariableVec(input_vars); - auto output_queues = GetQueueVecFromVariableVec(output_vars); - data::MapRunnerManager::Instance()->StartMapRunner( - program_id, global_block, ctx.GetPlace(), start_op_index, end_op_index, - input_var_names, output_var_names, input_queues, output_queues); + LOG(ERROR) << "MapOpKernel enter"; + // // Step1: get output vars and attrs + // auto input_vars = ctx.MultiInputVar("In"); + // auto output_vars = ctx.MultiOutputVar("Out"); + // + // CheckInputQueueStatus(input_vars); + // CheckAndInitOutputQueue(output_vars, /*capacity=*/2); + // + // auto input_var_names = ctx.Attr>("input_var_names"); + // auto output_var_names = ctx.Attr>("output_var_names"); + // auto* map_block = ctx.Attr("map_block"); + // auto start_op_index = ctx.Attr("start_op_index"); + // auto end_op_index = ctx.Attr("end_op_index"); + // auto program_id = ctx.Attr("program_id"); + // LOG(ERROR) << "MapOpKernel block id: " << map_block->ID(); + // for (auto var_name: map_block->LocalVarNames()) { + // LOG(ERROR) << "MapOpKernel map_block vars: " << var_name; + // } + // + // auto input_queues = GetQueueVecFromVariableVec(input_vars); + // auto output_queues = GetQueueVecFromVariableVec(output_vars); + // data::MapRunnerManager::Instance()->StartMapRunner( + // program_id, map_block, ctx.GetPlace(), 
start_op_index, end_op_index, + // input_var_names, output_var_names, input_queues, output_queues); + // LOG(ERROR) << "MapOpKernel finish"; } }; diff --git a/paddle/fluid/operators/data/map_runner.cc b/paddle/fluid/operators/data/map_runner.cc index 07875b5b6fe10b..062bda4216e659 100644 --- a/paddle/fluid/operators/data/map_runner.cc +++ b/paddle/fluid/operators/data/map_runner.cc @@ -17,16 +17,17 @@ namespace operators { namespace data { MapRunner::MapRunner( - const std::shared_ptr global_block, + const std::shared_ptr map_block, const platform::Place &place, int64_t start_op_index, int64_t end_op_index, int64_t program_id, const std::vector &input_var_names, const std::vector &output_var_names, const std::vector> input_queues, - const std::vector> output_queues) + const std::vector> output_queues, + const Scope* scope) : thread_pool_(1), running_(true), - global_block_(global_block), + map_block_(map_block), place_(place), start_op_index_(start_op_index), end_op_index_(end_op_index), @@ -35,6 +36,7 @@ MapRunner::MapRunner( output_var_names_(output_var_names), input_queues_(input_queues), output_queues_(output_queues) { + // scope_(scope) { VLOG(1) << "MapRunner init"; @@ -56,49 +58,87 @@ MapRunner::MapRunner( output_var_names_.size(), output_var_names_.size())); - // Step1: prepare executor - auto *program = global_block_->Program(); - auto cache_info = framework::GetExecutorInfoFromCache( - *program, place_, start_op_index_, end_op_index_, - /*is_grad=*/false, program_id, &scope_); - auto ¶llel_executor = cache_info.first; - - // Step2: parset persistable variables - auto &skip_eager_delete_vars = - framework::ExecutorInfoCache::Instance().SkipEagerDeleteVars( - program_id, /*is_grad=*/false); - if (cache_info.second /*is_new_created*/) { - skip_eager_delete_vars.insert(skip_eager_delete_vars.end(), - output_var_names.begin(), - output_var_names.end()); - framework::details::ParseSafeEagerDeletionSkipVars( - *program, end_op_index, output_var_names, 
&skip_eager_delete_vars); - } - - // Step3: start prefetch thread - StartMapThread(parallel_executor, skip_eager_delete_vars); + // // Step1: prepare executor + // auto *program = map_block_->Program(); + // auto cache_info = framework::GetExecutorInfoFromCache( + // *program, place_, start_op_index_, end_op_index_, + // /*is_grad=*/false, program_id, &scope_); + // auto ¶llel_executor = cache_info.first; + // + // // Step2: parset persistable variables + // auto &skip_eager_delete_vars = + // framework::ExecutorInfoCache::Instance().SkipEagerDeleteVars( + // program_id, /*is_grad=*/false); + // if (cache_info.second /*is_new_created*/) { + // skip_eager_delete_vars.insert(skip_eager_delete_vars.end(), + // output_var_names.begin(), + // output_var_names.end()); + // framework::details::ParseSafeEagerDeletionSkipVars( + // *program, end_op_index, output_var_names, &skip_eager_delete_vars); + // } + // + // // Step3: start prefetch thread + // StartMapThread(parallel_executor, skip_eager_delete_vars); + StartMapThread(scope); } -bool MapRunner::ShareInputsIntoScope() { +// bool MapRunner::ShareInputsIntoScope() { +// for (size_t i = 0; i < input_queues_.size(); i++) { +// // If input queue closed, namely EOE(end of epoch) from +// // dataset reader to here, read failed +// auto queue = input_queues_[i]; +// if (queue->IsClosed()) return false; +// // LOG(ERROR) << "ShareInputsIntoScope " << i << ", queue: " << queue; +// +// // read LoDTensorArray +// bool success = true; +// auto lod_tensor_arr = queue->Pop(&success); +// // LOG(ERROR) << "ShareInputsIntoScope Pop success: " << success << ", tensor: " << lod_tensor_arr.size(); +// if (!success) return false; +// +// // read LoDTensor +// auto tensor = lod_tensor_arr[0]; +// if(!tensor.IsInitialized()) return false; +// // LOG(ERROR) << "ShareInputsIntoScope read LoDTensor success"; +// +// // get input variable from scope and check status +// auto name = input_var_names_[i]; +// auto* var = scope_.Var(name); +// // 
LOG(ERROR) << "ShareInputsIntoScope input var: " << var << ", IsInitialized: " << var->IsInitialized() << ", is LoDTensor: " << var->IsType(); +// // if (!var->IsType() || !var->IsInitialized()) return false; +// +// // share input tensor to variable +// auto* dst_tensor = var->GetMutable(); +// dst_tensor->ShareDataWith(tensor); +// dst_tensor->set_lod(tensor.lod()); +// } +// return true; +// } + +bool MapRunner::ShareInputsIntoScope(Scope* scope) { for (size_t i = 0; i < input_queues_.size(); i++) { // If input queue closed, namely EOE(end of epoch) from // dataset reader to here, read failed auto queue = input_queues_[i]; if (queue->IsClosed()) return false; + // LOG(ERROR) << "ShareInputsIntoScope " << i << ", queue: " << queue; // read LoDTensorArray bool success = true; auto lod_tensor_arr = queue->Pop(&success); + // LOG(ERROR) << "ShareInputsIntoScope Pop success: " << success << ", tensor: " << lod_tensor_arr.size(); if (!success) return false; // read LoDTensor auto tensor = lod_tensor_arr[0]; if(!tensor.IsInitialized()) return false; + // LOG(ERROR) << "ShareInputsIntoScope read LoDTensor success"; // get input variable from scope and check status auto name = input_var_names_[i]; - auto* var = scope_.Var(name); - if (!var->IsType() || !var->IsInitialized()) return false; + auto* var = scope->Var(name); + // LOG(ERROR) << "ShareInputsIntoScope input var: " << var << ", IsInitialized: " << var->IsInitialized() << ", is LoDTensor: " << var->IsType(); + // if (!var->IsType() || !var->IsInitialized()) return false; // share input tensor to variable auto* dst_tensor = var->GetMutable(); @@ -108,19 +148,57 @@ bool MapRunner::ShareInputsIntoScope() { return true; } -void MapRunner::StartMapThread(std::shared_ptr executor, - const std::vector &skip_vars) { - thread_pool_.enqueue([this, executor, skip_vars]() -> void { +// void MapRunner::StartMapThread(std::shared_ptr executor, +// const std::vector &skip_vars) { +// thread_pool_.enqueue([this, executor, 
skip_vars]() -> void { +// while (running_.load()) { +// LOG(ERROR) << "StartMapThread enter"; +// // Step1: get input LoDTensor and share into Scope +// bool success = ShareInputsIntoScope(); +// if (!success) { +// Shutdown(); +// break; +// } +// LOG(ERROR) << "ShareInputsIntoScope success"; +// +// LOG(ERROR) << "MapRunner RunWithoutFetch start"; +// // Step2: run ops by executor without fetch +// executor->RunWithoutFetch(skip_vars); +// LOG(ERROR) << "MapRunner RunWithoutFetch success"; +// +// // Step3: fetch output variable to LoDTensor vector +// // and push to output queue +// for (size_t i = 0; i < output_var_names_.size(); i++) { +// framework::LoDTensorArray t_arr(1); +// auto *out_var = scope_.FindVar(output_var_names_[i]); +// LOG(ERROR) << "scope FindVar " << output_var_names_[i] << ", var: " << out_var; +// PADDLE_ENFORCE_NOT_NULL( +// out_var, platform::errors::NotFound( +// "The output variable %s is not found in DataLoader " +// "program's internal scope", +// output_var_names_[i])); +// CheckOutputVarStatus(*out_var, output_var_names_[i]); +// copy_tensor(out_var->Get(), &t_arr[0]); +// output_queues_[i]->Push(t_arr); +// } +// } +// }); +// } + +void MapRunner::StartMapThread(const Scope* scope) { + thread_pool_.enqueue([this, scope]() -> void { + auto& scope_ = scope->NewScope(); + framework::Executor executor(place_); while (running_.load()) { // Step1: get input LoDTensor and share into Scope - bool success = ShareInputsIntoScope(); + bool success = ShareInputsIntoScope(&scope_); if (!success) { Shutdown(); break; } // Step2: run ops by executor without fetch - executor->RunWithoutFetch(skip_vars); + executor.Run(*map_block_->Program(), &scope_, map_block_->ID(), false, true, std::vector(), false, true); // Step3: fetch output variable to LoDTensor vector // and push to output queue diff --git a/paddle/fluid/operators/data/map_runner.h b/paddle/fluid/operators/data/map_runner.h index 9139158b994c87..1aa0e657f24451 100644 --- 
a/paddle/fluid/operators/data/map_runner.h +++ b/paddle/fluid/operators/data/map_runner.h @@ -30,18 +30,20 @@ using ParallelExecutor = framework::ParallelExecutor; using Variable = framework::Variable; using LoDTensor = framework::LoDTensor; using LoDTensorBlockingQueue = operators::reader::LoDTensorBlockingQueue; +using LoDTensorBlockingQueueHolder = operators::reader::LoDTensorBlockingQueueHolder; namespace data { class MapRunner { public: - MapRunner(const std::shared_ptr global_block, + MapRunner(const std::shared_ptr map_block, const platform::Place &place, int64_t start_op_index, int64_t end_op_index, int64_t program_id, const std::vector &input_var_names, const std::vector &output_var_names, const std::vector> input_queues, - const std::vector> output_queues); + const std::vector> output_queues, + const Scope* scope); // ~MapRunner() { // VLOG(1) << "~MapRunner"; @@ -62,9 +64,12 @@ class MapRunner { out_tensor.set_lod(lod_tensor.lod()); } - bool ShareInputsIntoScope(); - void StartMapThread(std::shared_ptr executor, - const std::vector &skip_vars); + // bool ShareInputsIntoScope(); + bool ShareInputsIntoScope(Scope* scope); + + void StartMapThread(const Scope* scope); + // void StartMapThread(std::shared_ptr executor, + // const std::vector &skip_vars); void CheckInputVarStatus(const Variable &var, const std::string &var_name); void CheckOutputVarStatus(const Variable &var, const std::string &var_name); @@ -72,8 +77,7 @@ class MapRunner { ThreadPool thread_pool_; std::atomic running_; - Scope scope_; - std::shared_ptr global_block_; + std::shared_ptr map_block_; platform::Place place_; int64_t start_op_index_; int64_t end_op_index_; @@ -83,6 +87,8 @@ class MapRunner { std::vector output_var_names_; std::vector> input_queues_; std::vector> output_queues_; + + // Scope scope_; }; class MapRunnerManager { @@ -108,18 +114,19 @@ class MapRunnerManager { } void StartMapRunner( - int64_t program_id, BlockDesc *global_block, const platform::Place &place, + int64_t 
program_id, BlockDesc *map_block, const platform::Place &place, int64_t start_op_index, int64_t end_op_index, const std::vector &input_var_names, const std::vector &output_var_names, const std::vector> &input_queues, - const std::vector> &output_queues) { + const std::vector> &output_queues, + const Scope* scope) { auto iter = prog_id_to_runner_.find(program_id); if (iter == prog_id_to_runner_.end()) { prog_id_to_runner_[program_id] = std::unique_ptr(new MapRunner( - std::shared_ptr(global_block), place, start_op_index, + std::shared_ptr(map_block), place, start_op_index, end_op_index, program_id, input_var_names, output_var_names, - input_queues, output_queues)); + input_queues, output_queues, scope)); } } diff --git a/paddle/fluid/operators/data/pipeline.cc b/paddle/fluid/operators/data/pipeline.cc index 82fa06df714531..0c7b1b60f5dfd2 100644 --- a/paddle/fluid/operators/data/pipeline.cc +++ b/paddle/fluid/operators/data/pipeline.cc @@ -85,8 +85,10 @@ void Pipeline::StartPrefetchThread(std::shared_ptr executor, // CheckOutputVarStatus(*out_var, output_var_names_[i]); // copy_tensor(out_var->Get(), &t_arr[i]); auto out_queue = out_var->Get().GetQueue(); + LOG(ERROR) << "Executor out var: " << output_var_names_[i] << ", out_queue: " << out_queue; bool success = true; auto outputs = out_queue->Pop(&success); + LOG(ERROR) << "Executor get outputs from queue, success: " << success << ", outputs: " << outputs.size(); ; PADDLE_ENFORCE_EQ(success, true, platform::errors::PreconditionNotMet("Read from input queue failed")); copy_tensor(outputs.at(0), &t_arr[i]); diff --git a/paddle/fluid/operators/random_crop_and_resize_op.cu b/paddle/fluid/operators/random_crop_and_resize_op.cu index 684de9c9ffe9e8..d5b22924259ac5 100644 --- a/paddle/fluid/operators/random_crop_and_resize_op.cu +++ b/paddle/fluid/operators/random_crop_and_resize_op.cu @@ -298,6 +298,7 @@ class RandomCropAndResizeCUDAKernel : public framework::OpKernel { holder->InitOnce(2); out_queue = holder->GetQueue(); 
} + LOG(ERROR) << "crop resize out_var: " << out_var << "out_queue: " << out_queue; bool success = false; auto x = in_queue->Pop(&success); diff --git a/python/paddle/fluid/dataloader/__init__.py b/python/paddle/fluid/dataloader/__init__.py index 9ecfff2f7dadc9..5ad110908fab4f 100644 --- a/python/paddle/fluid/dataloader/__init__.py +++ b/python/paddle/fluid/dataloader/__init__.py @@ -29,8 +29,12 @@ from . import pipeline from .pipeline import * +from . import ops +from .ops import * + __all__ = dataset.__all__ \ + batch_sampler.__all__ \ + dataloader_iter.__all__ \ + sampler.__all__ \ - + pipeline.__all__ + + pipeline.__all__ \ + + ops.__all__ diff --git a/python/paddle/fluid/dataloader/ops.py b/python/paddle/fluid/dataloader/ops.py index d180c396698b25..b1c7ee13a7decd 100755 --- a/python/paddle/fluid/dataloader/ops.py +++ b/python/paddle/fluid/dataloader/ops.py @@ -15,55 +15,79 @@ from __future__ import print_function import paddle -import paddle.fluid as fluid -import paddle.static as static -from paddle.fluid import core, framework -from paddle.fluid.layers.utils import _hash_with_id -from paddle.common_ops_import import * +from ...fluid import core, framework, Program, program_guard, unique_name +from ...fluid.layers.utils import _hash_with_id +from ...common_ops_import import * __all__ = ["map"] +def _to_list(l): + if isinstance(l, (list, tuple)): + return l + return [l] + + +class MapGuard(object): + def __init__(self, main_program): + if not isinstance(main_program, Program): + raise TypeError("MapGuard should init with a Program") + self._main_program = main_program + + def __enter__(self): + self._main_program._create_block() + + def __exit__(self, exc_type, exc_val, exc_tb): + self._main_program._rollback() + return exc_type is None + + def map(map_func, inputs): assert not in_dygraph_mode(), \ "paddle.io.map can only be used in static mode" helper = LayerHelper("map", **locals()) - # inputs are Variables hold LoDTensorBlockingQueue - # TODO: cannot get 
tensor shape from LoDTensorBlockingQueue - program_inputs = [static.data('input_{}'.format(i), [None]) for i in range(len(inputs))] + # build map block + main_program = helper.main_program + with MapGuard(main_program): + map_block = main_program.current_block() - # build map program - main_program = fluid.Program() - startup_program = fluid.Program() - with static.guard(main_program, startup_program): + inputs = _to_list(inputs) + program_inputs = [ + map_program.create_var( + name=unique_name.generate("map_sub"), + type=core.VarDesc.VarType.LOD_TENSOR, + persistable=False) for i in range(len(inputs))] program_outputs = map_func(*program_inputs) + program_outputs = _to_list(program_outputs) - input_var_names = [v.name for v in program_inputs] - output_var_names = [v.name for v in program_outputs] + input_var_names = [v.name for v in program_inputs] + output_var_names = [v.name for v in program_outputs] - global_block = self._main_program.desc.block(0) - program_id = _hash_with_id(main_program, map_func) + program_id = _hash_with_id(map_program) + start_op_index = 0 + end_op_index = map_block.desc.op_size() outputs = \ [helper.create_variable( name=unique_name.generate("map"), - type=core.VarDesc.VarType.LOD_TENSOR_BLOCKING_QUEUE, + type=core.VarDesc.VarType.LOD_TENSOR, persistable=True) for _ in range(len(program_outputs))] attrs = { - "global_block": global_block, + "map_block": map_block, "program_id": program_id, - "start_op_index": 0, - "end_op_index": global_block.op_size(), + "start_op_index": start_op_index, + "end_op_index": end_op_index, "input_var_names": input_var_names, "output_var_names": output_var_names } + print("atttrs:", attrs) helper.append_op( type="map", - inputs={"X": inputs}, + inputs={"In": inputs}, outputs={"Out": outputs}, attrs=attrs) diff --git a/python/paddle/fluid/dataloader/pipeline.py b/python/paddle/fluid/dataloader/pipeline.py index a7011647c94127..2e440f0e82807d 100755 --- a/python/paddle/fluid/dataloader/pipeline.py +++ 
b/python/paddle/fluid/dataloader/pipeline.py @@ -85,6 +85,8 @@ def build(self): def _prepare_output_vars(self): output_vars = [] for var in self._out_vars: + if isinstance(var, (list, tuple)): + var = var[0] assert isinstance(var, framework.Variable), \ "output of DataLoader program should be Variable" var_desc = var.desc diff --git a/python/paddle/io/__init__.py b/python/paddle/io/__init__.py index b267b09925d4c0..57fb31a723817d 100755 --- a/python/paddle/io/__init__.py +++ b/python/paddle/io/__init__.py @@ -30,6 +30,7 @@ from ..fluid.dataloader import WeightedRandomSampler # noqa: F401 from ..fluid.dataloader import Subset # noqa: F401 from ..fluid.dataloader import random_split # noqa: F401 +from ..fluid.dataloader import map # noqa: F401 __all__ = [ #noqa 'Dataset', @@ -47,5 +48,6 @@ 'RandomSampler', 'WeightedRandomSampler', 'random_split', - 'Subset' + 'Subset', + 'map' ] From b14d92af1d39c2be0d2ecb70a52f80ed6c1a7f06 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Mon, 6 Dec 2021 12:44:16 +0000 Subject: [PATCH 23/95] fix typo and clean log --- paddle/fluid/operators/data/dataloader_op.h | 3 +-- paddle/fluid/operators/data/map_op.cc | 8 ++++---- paddle/fluid/operators/data/map_op.h | 4 +--- python/paddle/fluid/dataloader/ops.py | 4 ++-- 4 files changed, 8 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/operators/data/dataloader_op.h b/paddle/fluid/operators/data/dataloader_op.h index 4e5c1b2541d07a..611c720e2d1539 100644 --- a/paddle/fluid/operators/data/dataloader_op.h +++ b/paddle/fluid/operators/data/dataloader_op.h @@ -38,9 +38,8 @@ class DataLoaderOpKernel : public framework::OpKernel { program_id, global_block, ctx.GetPlace(), start_op_index, end_op_index, output_var_names, prefetch_depth); - LOG(ERROR) << "Get Pipeline finsih"; pipeline->ReadNext(output_vars); - LOG(ERROR) << "ReadNext finish"; + LOG(ERROR) << "DataLoaderOpKernel finish"; } }; diff --git a/paddle/fluid/operators/data/map_op.cc b/paddle/fluid/operators/data/map_op.cc index 
1a5c94b3379542..2fa6ff59a41d1a 100644 --- a/paddle/fluid/operators/data/map_op.cc +++ b/paddle/fluid/operators/data/map_op.cc @@ -68,10 +68,10 @@ class MapOp : public framework::OperatorBase { auto start_op_index = Attr("start_op_index"); auto end_op_index = Attr("end_op_index"); auto program_id = Attr("program_id"); - LOG(ERROR) << "MapOpKernel block id: " << map_block->ID(); - for (auto var_name: map_block->LocalVarNames()) { - LOG(ERROR) << "MapOpKernel map_block vars: " << var_name; - } + // LOG(ERROR) << "MapOpKernel block id: " << map_block->ID(); + // for (auto var_name: map_block->LocalVarNames()) { + // LOG(ERROR) << "MapOpKernel map_block vars: " << var_name; + // } auto input_queues = GetQueueVecFromVariableVec(input_vars); auto output_queues = GetQueueVecFromVariableVec(output_vars); diff --git a/paddle/fluid/operators/data/map_op.h b/paddle/fluid/operators/data/map_op.h index f254188ab0fd40..b9ca258a7b8b74 100644 --- a/paddle/fluid/operators/data/map_op.h +++ b/paddle/fluid/operators/data/map_op.h @@ -29,7 +29,6 @@ static void CheckInputQueueStatus(const std::vector& vars) { "Input Variables of MapOp should hold " "LoDTensorBlockingQueueHolder type")); auto queue = var->Get().GetQueue(); - LOG(ERROR) << "CheckAndInitOutputQueue get queue: " << queue; PADDLE_ENFORCE_NE(queue, nullptr, platform::errors::InvalidArgument( "Input LoDTensorBlockingQueue is not initialized")); @@ -39,7 +38,6 @@ static void CheckInputQueueStatus(const std::vector& vars) { static void CheckAndInitOutputQueue(const std::vector& vars, int capacity) { for (auto var : vars) { if (var->IsInitialized()) { - LOG(ERROR) << "CheckAndInitOutputQueue is LoDTensorBlockingQueueHolder: " << var->IsType(); PADDLE_ENFORCE_EQ(var->IsType(), true, platform::errors::InvalidArgument( "Output Variables of MapOp should hold " @@ -71,7 +69,7 @@ template class MapOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - LOG(ERROR) << 
"MapOpKernel enter"; + // LOG(ERROR) << "MapOpKernel enter"; // // Step1: get output vars and attrs // auto input_vars = ctx.MultiInputVar("In"); // auto output_vars = ctx.MultiOutputVar("Out"); diff --git a/python/paddle/fluid/dataloader/ops.py b/python/paddle/fluid/dataloader/ops.py index b1c7ee13a7decd..f1ce42a17be14d 100755 --- a/python/paddle/fluid/dataloader/ops.py +++ b/python/paddle/fluid/dataloader/ops.py @@ -56,7 +56,7 @@ def map(map_func, inputs): inputs = _to_list(inputs) program_inputs = [ - map_program.create_var( + map_block.create_var( name=unique_name.generate("map_sub"), type=core.VarDesc.VarType.LOD_TENSOR, persistable=False) for i in range(len(inputs))] @@ -66,7 +66,7 @@ def map(map_func, inputs): input_var_names = [v.name for v in program_inputs] output_var_names = [v.name for v in program_outputs] - program_id = _hash_with_id(map_program) + program_id = _hash_with_id(main_program) start_op_index = 0 end_op_index = map_block.desc.op_size() From acc731d6055cc1e0fc30c7bbad0a31538577b6e4 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 7 Dec 2021 07:01:42 +0000 Subject: [PATCH 24/95] polish code --- paddle/fluid/operators/data/map_op.cc | 23 ++-- paddle/fluid/operators/data/map_op.h | 33 +---- paddle/fluid/operators/data/map_runner.cc | 115 ++---------------- paddle/fluid/operators/data/map_runner.h | 38 +++--- paddle/fluid/operators/data/pipeline.cc | 2 - .../operators/random_crop_and_resize_op.cu | 4 +- python/paddle/fluid/dataloader/ops.py | 8 +- 7 files changed, 40 insertions(+), 183 deletions(-) diff --git a/paddle/fluid/operators/data/map_op.cc b/paddle/fluid/operators/data/map_op.cc index 2fa6ff59a41d1a..4ce6188db8d1b0 100644 --- a/paddle/fluid/operators/data/map_op.cc +++ b/paddle/fluid/operators/data/map_op.cc @@ -65,19 +65,14 @@ class MapOp : public framework::OperatorBase { auto input_var_names = Attr>("input_var_names"); auto output_var_names = Attr>("output_var_names"); auto* map_block = Attr("map_block"); - auto start_op_index = 
Attr("start_op_index"); - auto end_op_index = Attr("end_op_index"); auto program_id = Attr("program_id"); - // LOG(ERROR) << "MapOpKernel block id: " << map_block->ID(); - // for (auto var_name: map_block->LocalVarNames()) { - // LOG(ERROR) << "MapOpKernel map_block vars: " << var_name; - // } auto input_queues = GetQueueVecFromVariableVec(input_vars); auto output_queues = GetQueueVecFromVariableVec(output_vars); data::MapRunnerManager::Instance()->StartMapRunner( - program_id, map_block, dev_place,start_op_index, end_op_index, - input_var_names, output_var_names, input_queues, output_queues, &scope); + map_block, program_id, &scope, dev_place, + input_var_names, output_var_names, + input_queues, output_queues); LOG(ERROR) << "MapOpKernel RunImpl finish"; } }; @@ -111,12 +106,12 @@ class MapOpMaker : public framework::OpProtoAndCheckerMaker { "(BlockDesc *)" "The global block of executed map program " "desc."); - AddAttr("start_op_index", - "(int64_t)" - "The index of the op to start execution"); - AddAttr("end_op_index", - "(int64_t)" - "The index of the op to stop execution"); + // AddAttr("start_op_index", + // "(int64_t)" + // "The index of the op to start execution"); + // AddAttr("end_op_index", + // "(int64_t)" + // "The index of the op to stop execution"); AddAttr("program_id", "(int64_t)" "The unique hash id used as cache key for " diff --git a/paddle/fluid/operators/data/map_op.h b/paddle/fluid/operators/data/map_op.h index b9ca258a7b8b74..2606b1dd983125 100644 --- a/paddle/fluid/operators/data/map_op.h +++ b/paddle/fluid/operators/data/map_op.h @@ -45,11 +45,10 @@ static void CheckAndInitOutputQueue(const std::vector& vars, int capa auto queue = var->Get().GetQueue(); if (queue == nullptr) { auto* holder = var->template GetMutable(); - holder->InitOnce(2); + holder->InitOnce(capacity); } } else { - // VLOG(1) << "Initialize Output LoDTensorBlockingQueue capacity " << capacity; - LOG(ERROR) << "Initialize Output LoDTensorBlockingQueue capacity " << 
capacity; + VLOG(1) << "Initialize Output LoDTensorBlockingQueue capacity " << capacity; auto* holder = var->GetMutable(); holder->InitOnce(capacity); } @@ -68,33 +67,7 @@ static std::vector> GetQueueVecFromVaria template class MapOpKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override { - // LOG(ERROR) << "MapOpKernel enter"; - // // Step1: get output vars and attrs - // auto input_vars = ctx.MultiInputVar("In"); - // auto output_vars = ctx.MultiOutputVar("Out"); - // - // CheckInputQueueStatus(input_vars); - // CheckAndInitOutputQueue(output_vars, /*capacity=*/2); - // - // auto input_var_names = ctx.Attr>("input_var_names"); - // auto output_var_names = ctx.Attr>("output_var_names"); - // auto* map_block = ctx.Attr("map_block"); - // auto start_op_index = ctx.Attr("start_op_index"); - // auto end_op_index = ctx.Attr("end_op_index"); - // auto program_id = ctx.Attr("program_id"); - // LOG(ERROR) << "MapOpKernel block id: " << map_block->ID(); - // for (auto var_name: map_block->LocalVarNames()) { - // LOG(ERROR) << "MapOpKernel map_block vars: " << var_name; - // } - // - // auto input_queues = GetQueueVecFromVariableVec(input_vars); - // auto output_queues = GetQueueVecFromVariableVec(output_vars); - // data::MapRunnerManager::Instance()->StartMapRunner( - // program_id, map_block, ctx.GetPlace(), start_op_index, end_op_index, - // input_var_names, output_var_names, input_queues, output_queues); - // LOG(ERROR) << "MapOpKernel finish"; - } + void Compute(const framework::ExecutionContext& ctx) const override {} }; } // namespace operators diff --git a/paddle/fluid/operators/data/map_runner.cc b/paddle/fluid/operators/data/map_runner.cc index 062bda4216e659..2d7291a526454d 100644 --- a/paddle/fluid/operators/data/map_runner.cc +++ b/paddle/fluid/operators/data/map_runner.cc @@ -18,33 +18,29 @@ namespace data { MapRunner::MapRunner( const std::shared_ptr map_block, - const platform::Place &place, 
int64_t start_op_index, - int64_t end_op_index, int64_t program_id, + const int64_t program_id, + const Scope* scope, + const platform::Place &place, const std::vector &input_var_names, const std::vector &output_var_names, const std::vector> input_queues, - const std::vector> output_queues, - const Scope* scope) + const std::vector> output_queues) : thread_pool_(1), running_(true), map_block_(map_block), - place_(place), - start_op_index_(start_op_index), - end_op_index_(end_op_index), program_id_(program_id), + place_(place), input_var_names_(input_var_names), output_var_names_(output_var_names), input_queues_(input_queues), output_queues_(output_queues) { - // scope_(scope) { - VLOG(1) << "MapRunner init"; - PADDLE_ENFORCE_GT(end_op_index_, start_op_index_, - platform::errors::InvalidArgument( - "end_op_index should be greater than start_op_index, " - "but recieve %d <= %d.", - end_op_index_, start_op_index_)); + // PADDLE_ENFORCE_GT(end_op_index_, start_op_index_, + // platform::errors::InvalidArgument( + // "end_op_index should be greater than start_op_index, " + // "but recieve %d <= %d.", + // end_op_index_, start_op_index_)); PADDLE_ENFORCE_EQ(input_var_names_.size(), input_queues_.size(), platform::errors::InvalidArgument( "input_var_names length should be equal to input_queues length, " @@ -58,63 +54,9 @@ MapRunner::MapRunner( output_var_names_.size(), output_var_names_.size())); - // // Step1: prepare executor - // auto *program = map_block_->Program(); - // auto cache_info = framework::GetExecutorInfoFromCache( - // *program, place_, start_op_index_, end_op_index_, - // /*is_grad=*/false, program_id, &scope_); - // auto ¶llel_executor = cache_info.first; - // - // // Step2: parset persistable variables - // auto &skip_eager_delete_vars = - // framework::ExecutorInfoCache::Instance().SkipEagerDeleteVars( - // program_id, /*is_grad=*/false); - // if (cache_info.second /*is_new_created*/) { - // skip_eager_delete_vars.insert(skip_eager_delete_vars.end(), - 
// output_var_names.begin(), - // output_var_names.end()); - // framework::details::ParseSafeEagerDeletionSkipVars( - // *program, end_op_index, output_var_names, &skip_eager_delete_vars); - // } - // - // // Step3: start prefetch thread - // StartMapThread(parallel_executor, skip_eager_delete_vars); StartMapThread(scope); } -// bool MapRunner::ShareInputsIntoScope() { -// for (size_t i = 0; i < input_queues_.size(); i++) { -// // If input queue closed, namely EOE(end of epoch) from -// // dataset reader to here, read failed -// auto queue = input_queues_[i]; -// if (queue->IsClosed()) return false; -// // LOG(ERROR) << "ShareInputsIntoScope " << i << ", queue: " << queue; -// -// // read LoDTensorArray -// bool success = true; -// auto lod_tensor_arr = queue->Pop(&success); -// // LOG(ERROR) << "ShareInputsIntoScope Pop success: " << success << ", tensor: " << lod_tensor_arr.size(); -// if (!success) return false; -// -// // read LoDTensor -// auto tensor = lod_tensor_arr[0]; -// if(!tensor.IsInitialized()) return false; -// // LOG(ERROR) << "ShareInputsIntoScope read LoDTensor success"; -// -// // get input variable from scope and check status -// auto name = input_var_names_[i]; -// auto* var = scope_.Var(name); -// // LOG(ERROR) << "ShareInputsIntoScope input var: " << var << ", IsInitialized: " << var->IsInitialized() << ", is LoDTensor: " << var->IsType(); -// // if (!var->IsType() || !var->IsInitialized()) return false; -// -// // share input tensor to variable -// auto* dst_tensor = var->GetMutable(); -// dst_tensor->ShareDataWith(tensor); -// dst_tensor->set_lod(tensor.lod()); -// } -// return true; -// } - bool MapRunner::ShareInputsIntoScope(Scope* scope) { for (size_t i = 0; i < input_queues_.size(); i++) { // If input queue closed, namely EOE(end of epoch) from @@ -148,43 +90,6 @@ bool MapRunner::ShareInputsIntoScope(Scope* scope) { return true; } -// void MapRunner::StartMapThread(std::shared_ptr executor, -// const std::vector &skip_vars) { -// 
thread_pool_.enqueue([this, executor, skip_vars]() -> void { -// while (running_.load()) { -// LOG(ERROR) << "StartMapThread enter"; -// // Step1: get input LoDTensor and share into Scope -// bool success = ShareInputsIntoScope(); -// if (!success) { -// Shutdown(); -// break; -// } -// LOG(ERROR) << "ShareInputsIntoScope success"; -// -// LOG(ERROR) << "MapRunner RunWithoutFetch start"; -// // Step2: run ops by executor without fetch -// executor->RunWithoutFetch(skip_vars); -// LOG(ERROR) << "MapRunner RunWithoutFetch success"; -// -// // Step3: fetch output variable to LoDTensor vector -// // and push to output queue -// for (size_t i = 0; i < output_var_names_.size(); i++) { -// framework::LoDTensorArray t_arr(1); -// auto *out_var = scope_.FindVar(output_var_names_[i]); -// LOG(ERROR) << "scope FindVar " << output_var_names_[i] << ", var: " << out_var; -// PADDLE_ENFORCE_NOT_NULL( -// out_var, platform::errors::NotFound( -// "The output variable %s is not found in DataLoader " -// "program's internal scope", -// output_var_names_[i])); -// CheckOutputVarStatus(*out_var, output_var_names_[i]); -// copy_tensor(out_var->Get(), &t_arr[0]); -// output_queues_[i]->Push(t_arr); -// } -// } -// }); -// } - void MapRunner::StartMapThread(const Scope* scope) { thread_pool_.enqueue([this, scope]() -> void { auto& scope_ = scope->NewScope(); diff --git a/paddle/fluid/operators/data/map_runner.h b/paddle/fluid/operators/data/map_runner.h index 1aa0e657f24451..333b4017ca2488 100644 --- a/paddle/fluid/operators/data/map_runner.h +++ b/paddle/fluid/operators/data/map_runner.h @@ -37,13 +37,16 @@ namespace data { class MapRunner { public: MapRunner(const std::shared_ptr map_block, - const platform::Place &place, int64_t start_op_index, - int64_t end_op_index, int64_t program_id, - const std::vector &input_var_names, - const std::vector &output_var_names, - const std::vector> input_queues, - const std::vector> output_queues, - const Scope* scope); + const int64_t program_id, + 
const Scope* scope, + const platform::Place &place, + // int64_t start_op_index, + // int64_t end_op_index, + // int64_t program_id, + const std::vector &input_var_names, + const std::vector &output_var_names, + const std::vector> input_queues, + const std::vector> output_queues); // ~MapRunner() { // VLOG(1) << "~MapRunner"; @@ -64,12 +67,9 @@ class MapRunner { out_tensor.set_lod(lod_tensor.lod()); } - // bool ShareInputsIntoScope(); bool ShareInputsIntoScope(Scope* scope); void StartMapThread(const Scope* scope); - // void StartMapThread(std::shared_ptr executor, - // const std::vector &skip_vars); void CheckInputVarStatus(const Variable &var, const std::string &var_name); void CheckOutputVarStatus(const Variable &var, const std::string &var_name); @@ -78,17 +78,13 @@ class MapRunner { std::atomic running_; std::shared_ptr map_block_; - platform::Place place_; - int64_t start_op_index_; - int64_t end_op_index_; int64_t program_id_; + platform::Place place_; std::vector input_var_names_; std::vector output_var_names_; std::vector> input_queues_; std::vector> output_queues_; - - // Scope scope_; }; class MapRunnerManager { @@ -114,19 +110,17 @@ class MapRunnerManager { } void StartMapRunner( - int64_t program_id, BlockDesc *map_block, const platform::Place &place, - int64_t start_op_index, int64_t end_op_index, + BlockDesc *map_block, const int64_t program_id, + const Scope* scope, const platform::Place &place, const std::vector &input_var_names, const std::vector &output_var_names, const std::vector> &input_queues, - const std::vector> &output_queues, - const Scope* scope) { + const std::vector> &output_queues) { auto iter = prog_id_to_runner_.find(program_id); if (iter == prog_id_to_runner_.end()) { prog_id_to_runner_[program_id] = std::unique_ptr(new MapRunner( - std::shared_ptr(map_block), place, start_op_index, - end_op_index, program_id, input_var_names, output_var_names, - input_queues, output_queues, scope)); + std::shared_ptr(map_block), program_id, scope, 
place, + input_var_names, output_var_names, input_queues, output_queues)); } } diff --git a/paddle/fluid/operators/data/pipeline.cc b/paddle/fluid/operators/data/pipeline.cc index 0c7b1b60f5dfd2..82fa06df714531 100644 --- a/paddle/fluid/operators/data/pipeline.cc +++ b/paddle/fluid/operators/data/pipeline.cc @@ -85,10 +85,8 @@ void Pipeline::StartPrefetchThread(std::shared_ptr executor, // CheckOutputVarStatus(*out_var, output_var_names_[i]); // copy_tensor(out_var->Get(), &t_arr[i]); auto out_queue = out_var->Get().GetQueue(); - LOG(ERROR) << "Executor out var: " << output_var_names_[i] << ", out_queue: " << out_queue; bool success = true; auto outputs = out_queue->Pop(&success); - LOG(ERROR) << "Executor get outputs from queue, success: " << success << ", outputs: " << outputs.size(); ; PADDLE_ENFORCE_EQ(success, true, platform::errors::PreconditionNotMet("Read from input queue failed")); copy_tensor(outputs.at(0), &t_arr[i]); diff --git a/paddle/fluid/operators/random_crop_and_resize_op.cu b/paddle/fluid/operators/random_crop_and_resize_op.cu index d5b22924259ac5..c9bc1a3a6f6cae 100644 --- a/paddle/fluid/operators/random_crop_and_resize_op.cu +++ b/paddle/fluid/operators/random_crop_and_resize_op.cu @@ -288,17 +288,15 @@ class RandomCropAndResizeCUDAKernel : public framework::OpKernel { auto* in_var = ctx.InputVar("X"); auto in_queue = in_var->Get().GetQueue(); - LOG(ERROR) << "crop resize in_var: " << in_var << "in_queue: " << in_queue; auto* out_var = ctx.OutputVar("Out"); auto out_queue = out_var->Get().GetQueue(); if (out_queue == nullptr) { - LOG(ERROR) << "crop resize init output queue"; + LOG(ERROR) << "RandomCropAndResize out_queue init"; auto* holder = out_var->template GetMutable(); holder->InitOnce(2); out_queue = holder->GetQueue(); } - LOG(ERROR) << "crop resize out_var: " << out_var << "out_queue: " << out_queue; bool success = false; auto x = in_queue->Pop(&success); diff --git a/python/paddle/fluid/dataloader/ops.py 
b/python/paddle/fluid/dataloader/ops.py index f1ce42a17be14d..f835be5a45309c 100755 --- a/python/paddle/fluid/dataloader/ops.py +++ b/python/paddle/fluid/dataloader/ops.py @@ -52,6 +52,7 @@ def map(map_func, inputs): # build map block main_program = helper.main_program with MapGuard(main_program): + program_id = _hash_with_id(main_program, map_func) map_block = main_program.current_block() inputs = _to_list(inputs) @@ -66,10 +67,6 @@ def map(map_func, inputs): input_var_names = [v.name for v in program_inputs] output_var_names = [v.name for v in program_outputs] - program_id = _hash_with_id(main_program) - start_op_index = 0 - end_op_index = map_block.desc.op_size() - outputs = \ [helper.create_variable( name=unique_name.generate("map"), @@ -78,12 +75,9 @@ def map(map_func, inputs): attrs = { "map_block": map_block, "program_id": program_id, - "start_op_index": start_op_index, - "end_op_index": end_op_index, "input_var_names": input_var_names, "output_var_names": output_var_names } - print("atttrs:", attrs) helper.append_op( type="map", From 78824a8c1e0a04ac5f64400dba8ee1bb3ccc54de Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 7 Dec 2021 07:50:07 +0000 Subject: [PATCH 25/95] polish code --- paddle/fluid/operators/data/map_op.cc | 12 ------------ paddle/fluid/operators/data/map_runner.cc | 5 ----- 2 files changed, 17 deletions(-) diff --git a/paddle/fluid/operators/data/map_op.cc b/paddle/fluid/operators/data/map_op.cc index 4ce6188db8d1b0..7153ddd1b7787c 100644 --- a/paddle/fluid/operators/data/map_op.cc +++ b/paddle/fluid/operators/data/map_op.cc @@ -39,12 +39,6 @@ class MapOp : public framework::OperatorBase { ctx.GetPlace()); } - // framework::OpKernelType GetKernelTypeForVar( - // const std::string& var_name, const framework::Tensor& tensor, - // const framework::OpKernelType& expected_kernel_type) const override { - // return expected_kernel_type; - // } - private: void RunImpl(const framework::Scope& scope, const platform::Place& dev_place) const 
override { @@ -106,12 +100,6 @@ class MapOpMaker : public framework::OpProtoAndCheckerMaker { "(BlockDesc *)" "The global block of executed map program " "desc."); - // AddAttr("start_op_index", - // "(int64_t)" - // "The index of the op to start execution"); - // AddAttr("end_op_index", - // "(int64_t)" - // "The index of the op to stop execution"); AddAttr("program_id", "(int64_t)" "The unique hash id used as cache key for " diff --git a/paddle/fluid/operators/data/map_runner.cc b/paddle/fluid/operators/data/map_runner.cc index 2d7291a526454d..c6838c06d65509 100644 --- a/paddle/fluid/operators/data/map_runner.cc +++ b/paddle/fluid/operators/data/map_runner.cc @@ -63,24 +63,19 @@ bool MapRunner::ShareInputsIntoScope(Scope* scope) { // dataset reader to here, read failed auto queue = input_queues_[i]; if (queue->IsClosed()) return false; - // LOG(ERROR) << "ShareInputsIntoScope " << i << ", queue: " << queue; // read LoDTensorArray bool success = true; auto lod_tensor_arr = queue->Pop(&success); - // LOG(ERROR) << "ShareInputsIntoScope Pop success: " << success << ", tensor: " << lod_tensor_arr.size(); if (!success) return false; // read LoDTensor auto tensor = lod_tensor_arr[0]; if(!tensor.IsInitialized()) return false; - // LOG(ERROR) << "ShareInputsIntoScope read LoDTensor success"; // get input variable from scope and check status auto name = input_var_names_[i]; auto* var = scope->Var(name); - // LOG(ERROR) << "ShareInputsIntoScope input var: " << var << ", IsInitialized: " << var->IsInitialized() << ", is LoDTensor: " << var->IsType(); - // if (!var->IsType() || !var->IsInitialized()) return false; // share input tensor to variable auto* dst_tensor = var->GetMutable(); From e9dd9ed8585f280da5d8fbfaae789b1fbb4689bc Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Wed, 8 Dec 2021 12:29:06 +0000 Subject: [PATCH 26/95] queue + loop success --- .../fluid/framework/ir/data_io_queue_pass.cc | 97 ++++++++++----- .../fluid/operators/data/batch_decode_op.cu | 56 
++++----- paddle/fluid/operators/data/map_op.cc | 1 + paddle/fluid/operators/data/map_op.h | 1 + paddle/fluid/operators/data/map_runner.cc | 116 ++++++++++++------ paddle/fluid/operators/data/map_runner.h | 1 + .../fluid/operators/file_label_reader_op.cc | 75 +++++------ .../operators/random_crop_and_resize_op.cc | 2 - .../operators/random_crop_and_resize_op.cu | 70 +++++------ python/paddle/fluid/dataloader/ops.py | 5 +- 10 files changed, 247 insertions(+), 177 deletions(-) diff --git a/paddle/fluid/framework/ir/data_io_queue_pass.cc b/paddle/fluid/framework/ir/data_io_queue_pass.cc index 8cfa5452647181..490020c25bdb2c 100644 --- a/paddle/fluid/framework/ir/data_io_queue_pass.cc +++ b/paddle/fluid/framework/ir/data_io_queue_pass.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include +#include #include #include "glog/logging.h" @@ -24,42 +24,83 @@ namespace ir { class Graph; -static int MAX_VARS_LEN = 100; +std::set output_queue_holder_ops = { + "file_label_reader", + "map", +}; -class DataIOQueuePass: public Pass { - protected: - void ApplyImpl(ir::Graph *graph) const override { - // VLOG(3) << "Change inputs/outputs of data ops to queue"; - LOG(ERROR) << "Change inputs/outputs of data ops to queue"; - std::vector var_names; - var_names.reserve(MAX_VARS_LEN); - for (const Node *n : graph->Nodes()) { - if (n->IsOp() && n->Op()) { - auto *op = n->Op(); - if (op->Type() == "file_label_reader" - || op->Type() == "batch_decode" - || op->Type() == "random_crop_and_resize" - || op->Type() == "map") { - auto& outputs = op->Outputs(); - for (auto iter = outputs.begin(); iter != outputs.end(); iter++) { - auto vars = iter->second; - std::copy(vars.begin(), vars.end(), std::back_inserter(var_names)); - } +std::set input_array_ops = { + "random_crop_and_resize", + "batch_decode", +}; + +static bool 
IsOutputQueueHolderOp(std::string op_type) { + return output_queue_holder_ops.find(op_type) != output_queue_holder_ops.end(); +} + +static bool IsInputArrayOp(std::string op_type) { + return input_array_ops.find(op_type) != input_array_ops.end(); +} + +static void ProcessOutputQueueHolderOp(ir::Graph* graph) { + std::set var_names; + for (const Node *n : graph->Nodes()) { + if (n->IsOp() && n->Op()) { + auto *op = n->Op(); + if (IsOutputQueueHolderOp(op->Type())) { + auto& outputs = op->Outputs(); + for (auto iter = outputs.begin(); iter != outputs.end(); iter++) { + for (auto var: iter->second) var_names.insert(var); } } } + } + + for (const Node *n : graph->Nodes()) { + if (n->IsVar() && n->Var()) { + auto *var = n->Var(); + if (var_names.find(var->Name()) != var_names.end()) { + // VLOG(3) << "Change output variable type of " << var->Name() << " to queue holder"; + LOG(ERROR) << "Change output variable type of " << var->Name() << " to queue holder"; + var->SetType(framework::proto::VarType::LOD_TENSOR_BLOCKING_QUEUE); + var->SetPersistable(true); + } + } + } +} - for (const Node *n : graph->Nodes()) { - if (n->IsVar() && n->Var()) { - auto *var = n->Var(); - auto iter = std::find(var_names.begin(), var_names.end(), var->Name()); - if (iter != var_names.end()) { - var->SetType(framework::proto::VarType::LOD_TENSOR_BLOCKING_QUEUE); - var->SetPersistable(true); +static void ProcessInputArrayOp(ir::Graph* graph) { + std::set var_names; + for (const Node *n : graph->Nodes()) { + if (n->IsOp() && n->Op()) { + auto *op = n->Op(); + if (IsInputArrayOp(op->Type())) { + auto& inputs = op->Inputs(); + for (auto iter = inputs.begin(); iter != inputs.end(); iter++) { + for (auto var: iter->second) var_names.insert(var); } } } } + + for (const Node *n : graph->Nodes()) { + if (n->IsVar() && n->Var()) { + auto *var = n->Var(); + if (var_names.find(var->Name()) != var_names.end()) { + // VLOG(3) << "Change output variable type of " << var->Name() << " to queue holder"; + 
LOG(ERROR) << "Change input variable type of " << var->Name() << " to array"; + var->SetType(framework::proto::VarType::LOD_TENSOR_ARRAY); + } + } + } +} + +class DataIOQueuePass: public Pass { + protected: + void ApplyImpl(ir::Graph* graph) const override { + ProcessOutputQueueHolderOp(graph); + ProcessInputArrayOp(graph); + } }; } // namespace ir diff --git a/paddle/fluid/operators/data/batch_decode_op.cu b/paddle/fluid/operators/data/batch_decode_op.cu index 6e22423c93589f..b25b8b383f3ef8 100644 --- a/paddle/fluid/operators/data/batch_decode_op.cu +++ b/paddle/fluid/operators/data/batch_decode_op.cu @@ -39,38 +39,38 @@ class GPUBatchDecodeJpegKernel : public framework::OpKernel { // multi-phrase decode thread pool if (!decode_pool) { - LOG(ERROR) << "decode_pool init"; + LOG(ERROR) << "GPUBatchDecodeJpegKernel decode_pool init"; decode_pool = new NvjpegDecoderThreadPool(num_threads, mode); } - // const framework::LoDTensorArray* inputs = - // ctx.Input("X"); - // - // auto* out = ctx.OutputVar("Out"); - // auto& out_array = *out->GetMutable(); - // out_array.resize(inputs->size()); + const framework::LoDTensorArray* inputs = + ctx.Input("X"); - auto* in_var = ctx.InputVar("X"); - auto in_queue = in_var->Get().GetQueue(); + auto* out = ctx.OutputVar("Out"); + auto& out_array = *out->GetMutable(); + out_array.resize(inputs->size()); - auto* out_var = ctx.OutputVar("Out"); - auto out_queue = out_var->Get().GetQueue(); - if (out_queue == nullptr) { - LOG(ERROR) << "decode init output queue"; - auto* holder = out_var->template GetMutable(); - holder->InitOnce(2); - out_queue = holder->GetQueue(); - } - - bool success = true; - auto inputs = in_queue->Pop(&success); - PADDLE_ENFORCE_EQ(success, true, - platform::errors::PreconditionNotMet("Read from input queue failed")); - framework::LoDTensorArray out_array; - out_array.resize(inputs.size()); - - for (size_t i = 0; i < inputs.size(); i++) { - const framework::LoDTensor x = inputs.at(i); + // auto* in_var = 
ctx.InputVar("X"); + // auto in_queue = in_var->Get().GetQueue(); + // + // auto* out_var = ctx.OutputVar("Out"); + // auto out_queue = out_var->Get().GetQueue(); + // if (out_queue == nullptr) { + // LOG(ERROR) << "decode init output queue"; + // auto* holder = out_var->template GetMutable(); + // holder->InitOnce(2); + // out_queue = holder->GetQueue(); + // } + // + // bool success = true; + // auto inputs = in_queue->Pop(&success); + // PADDLE_ENFORCE_EQ(success, true, + // platform::errors::PreconditionNotMet("Read from input queue failed")); + // framework::LoDTensorArray out_array; + // out_array.resize(inputs.size()); + + for (size_t i = 0; i < inputs->size(); i++) { + const framework::LoDTensor x = inputs->at(i); auto* x_data = x.data(); size_t x_numel = static_cast(x.numel()); @@ -84,7 +84,7 @@ class GPUBatchDecodeJpegKernel : public framework::OpKernel { } decode_pool->RunAll(true); - out_queue->Push(out_array); + // out_queue->Push(out_array); // // multi-phrase decode single thread // if (!nvjpeg_decoder) { diff --git a/paddle/fluid/operators/data/map_op.cc b/paddle/fluid/operators/data/map_op.cc index 7153ddd1b7787c..622dbbac852615 100644 --- a/paddle/fluid/operators/data/map_op.cc +++ b/paddle/fluid/operators/data/map_op.cc @@ -44,6 +44,7 @@ class MapOp : public framework::OperatorBase { const platform::Place& dev_place) const override { LOG(ERROR) << "MapOpKernel RunImpl enter"; // Step1: get output vars and attrs + // FIXME(dkp): multi input support auto input_var = scope.FindVar(Input("In")); auto output_var = scope.FindVar(Output("Out")); std::vector input_vars; diff --git a/paddle/fluid/operators/data/map_op.h b/paddle/fluid/operators/data/map_op.h index 2606b1dd983125..b52c060c223bd9 100644 --- a/paddle/fluid/operators/data/map_op.h +++ b/paddle/fluid/operators/data/map_op.h @@ -44,6 +44,7 @@ static void CheckAndInitOutputQueue(const std::vector& vars, int capa "LoDTensorBlockingQueueHolder type")); auto queue = var->Get().GetQueue(); if (queue 
== nullptr) { + LOG(ERROR) << "MapOpKernel init queue"; auto* holder = var->template GetMutable(); holder->InitOnce(capacity); } diff --git a/paddle/fluid/operators/data/map_runner.cc b/paddle/fluid/operators/data/map_runner.cc index c6838c06d65509..c4f4c3f53bd019 100644 --- a/paddle/fluid/operators/data/map_runner.cc +++ b/paddle/fluid/operators/data/map_runner.cc @@ -36,11 +36,6 @@ MapRunner::MapRunner( output_queues_(output_queues) { VLOG(1) << "MapRunner init"; - // PADDLE_ENFORCE_GT(end_op_index_, start_op_index_, - // platform::errors::InvalidArgument( - // "end_op_index should be greater than start_op_index, " - // "but recieve %d <= %d.", - // end_op_index_, start_op_index_)); PADDLE_ENFORCE_EQ(input_var_names_.size(), input_queues_.size(), platform::errors::InvalidArgument( "input_var_names length should be equal to input_queues length, " @@ -64,23 +59,50 @@ bool MapRunner::ShareInputsIntoScope(Scope* scope) { auto queue = input_queues_[i]; if (queue->IsClosed()) return false; - // read LoDTensorArray + // read LoDTensorArray from queue bool success = true; - auto lod_tensor_arr = queue->Pop(&success); + auto tensor_arr = queue->Pop(&success); if (!success) return false; - // read LoDTensor - auto tensor = lod_tensor_arr[0]; - if(!tensor.IsInitialized()) return false; - - // get input variable from scope and check status - auto name = input_var_names_[i]; - auto* var = scope->Var(name); + if (tensor_arr.size() == 1) { + // input array length = 1, treat input type as LoDTensor + // FIXME(dkp): this may incur error if batch size = 1 + auto tensor = tensor_arr[0]; + if (!tensor.IsInitialized()) return false; + + // get dst variable from scope and check status + auto name = input_var_names_[i]; + auto* var = scope->Var(name); + + // share input tensor to dst variable + auto* dst_tensor = var->GetMutable(); + dst_tensor->ShareDataWith(tensor); + dst_tensor->set_lod(tensor.lod()); + } else { + // input array length > 1 treat input type as LoDTensorArray + for 
(auto tensor: tensor_arr) { + if (!tensor.IsInitialized()) return false; + } - // share input tensor to variable - auto* dst_tensor = var->GetMutable(); - dst_tensor->ShareDataWith(tensor); - dst_tensor->set_lod(tensor.lod()); + // get dst variable from scope and check status + auto name = input_var_names_[i]; + auto* var = scope->Var(name); + + // share input tensor to dst variable + auto& dst_tensor_arr = *(var->GetMutable()); + dst_tensor_arr.clear(); + dst_tensor_arr.reserve(tensor_arr.size()); + for (size_t i = 0; i < tensor_arr.size(); i++) { + dst_tensor_arr.emplace_back(tensor_arr[i]); + // auto tensor = tensor_arr[i]; + // auto dst_tensor = dst_tensor_arr[i]; + // // dst_tensor.Resize(tensor.dims()); + // // dst_tensor.mutable_data(tensor.place(), tensor.type()); + // // dst_tensor.ShareDataWith(tensor); + // copy_tensor(tensor, &dst_tensor); + // // dst_tensor.set_lod(tensor.lod()); + } + } } return true; } @@ -90,29 +112,39 @@ void MapRunner::StartMapThread(const Scope* scope) { auto& scope_ = scope->NewScope(); framework::Executor executor(place_); while (running_.load()) { - // Step1: get input LoDTensor and share into Scope + // Step 1: get input LoDTensor and share into Scope bool success = ShareInputsIntoScope(&scope_); if (!success) { Shutdown(); break; } - // Step2: run ops by executor without fetch + // Step 2: run ops by executor without fetch executor.Run(*map_block_->Program(), &scope_, map_block_->ID(), false, true, std::vector(), false, true); - // Step3: fetch output variable to LoDTensor vector + // Step 3: fetch output variable to LoDTensor vector // and push to output queue for (size_t i = 0; i < output_var_names_.size(); i++) { - framework::LoDTensorArray t_arr(1); auto *out_var = scope_.FindVar(output_var_names_[i]); PADDLE_ENFORCE_NOT_NULL( out_var, platform::errors::NotFound( - "The output variable %s is not found in DataLoader " + "The output variable %s is not found in Map " "program's internal scope", output_var_names_[i])); 
CheckOutputVarStatus(*out_var, output_var_names_[i]); - copy_tensor(out_var->Get(), &t_arr[0]); - output_queues_[i]->Push(t_arr); + + if (out_var->IsType()) { + framework::LoDTensorArray t_arr(1); + copy_tensor(out_var->Get(), &t_arr[0]); + output_queues_[i]->Push(t_arr); + } else { + auto out_arr = out_var->Get(); + framework::LoDTensorArray t_arr(out_arr.size()); + for (size_t i = 0; i < out_arr.size(); i++) { + copy_tensor(out_arr[i], &t_arr[i]); + } + output_queues_[i]->Push(t_arr); + } } } }); @@ -120,19 +152,26 @@ void MapRunner::StartMapThread(const Scope* scope) { void MapRunner::CheckOutputVarStatus(const Variable &var, const std::string &var_name) { - // only LoDTensor variable type support currently - PADDLE_ENFORCE_EQ( - var.IsType(), true, - platform::errors::InvalidArgument( - "The output variable %s get from DataLoader program's " - "internal scope holds wrong type. Expect type is " - "LoDTensor, but receive type is %s.", - var_name, platform::demangle(framework::ToTypeName(var.Type())))); - PADDLE_ENFORCE_EQ(var.Get().IsInitialized(), true, - platform::errors::InvalidArgument( - "The tensor in output variable %s get from DataLoader " - "program's internal scope is not initialized.", - var_name)); + // only LoDTensor & LoDTensorArray variable type support currently + if (var.IsType()) { + PADDLE_ENFORCE_EQ(var.Get().IsInitialized(), true, + platform::errors::InvalidArgument( + "The tensor in output variable %s get from Map" + "program's internal scope is not initialized.", + var_name)); + } else if (var.IsType()) { + auto tensor_array = var.Get(); + for (auto tensor: tensor_array) { + PADDLE_ENFORCE_EQ(tensor.IsInitialized(), true, + platform::errors::InvalidArgument( + "The tensor in LoDTensorArray of output " + "variable %s get from Map program's internal " + "scope is not initialized.", var_name)); + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "MapOp can only support LoDTensor or LoDTensorArray")); + } } void 
MapRunner::Shutdown() { @@ -144,6 +183,7 @@ void MapRunner::Shutdown() { // set running_ as false to exit map thread, then release thread pool running_.store(false); + // FIXME: ThreadPool doesn't have shutdown method delete &thread_pool_; } diff --git a/paddle/fluid/operators/data/map_runner.h b/paddle/fluid/operators/data/map_runner.h index 333b4017ca2488..b0aec606fdc8a2 100644 --- a/paddle/fluid/operators/data/map_runner.h +++ b/paddle/fluid/operators/data/map_runner.h @@ -29,6 +29,7 @@ using ParallelExecutor = framework::ParallelExecutor; using Variable = framework::Variable; using LoDTensor = framework::LoDTensor; +using LoDTensorArray = framework::LoDTensorArray; using LoDTensorBlockingQueue = operators::reader::LoDTensorBlockingQueue; using LoDTensorBlockingQueueHolder = operators::reader::LoDTensorBlockingQueueHolder; diff --git a/paddle/fluid/operators/file_label_reader_op.cc b/paddle/fluid/operators/file_label_reader_op.cc index 459d363a59b7dc..428e63572d72c3 100644 --- a/paddle/fluid/operators/file_label_reader_op.cc +++ b/paddle/fluid/operators/file_label_reader_op.cc @@ -27,6 +27,7 @@ namespace paddle { namespace operators { using LoDTensorArray = framework::LoDTensorArray; +using LoDTensorBlockingQueue = operators::reader::LoDTensorBlockingQueue; using LoDTensorBlockingQueueHolder = operators::reader::LoDTensorBlockingQueueHolder; enum BufferStatus { @@ -91,7 +92,8 @@ void Buffer::Close() { class FileDataReader { public: - explicit FileDataReader(const framework::ExecutionContext& ctx) { + explicit FileDataReader(const framework::ExecutionContext& ctx, + LoDTensorBlockingQueue* queue) { std::vector files = ctx.Attr>("files"); std::vector labels = ctx.Attr>("labels"); @@ -106,7 +108,7 @@ class FileDataReader { is_closed_ = false; for (int i = 0, n = files.size(); i < n; i++) image_label_pairs_.emplace_back(std::move(files[i]), labels[i]); - StartLoadThread(); + StartLoadThread(queue); } int GetStartIndex() { @@ -135,14 +137,13 @@ class FileDataReader { 
return out; } - void StartLoadThread() { + void StartLoadThread(LoDTensorBlockingQueue* queue) { if (load_thrd_.joinable()) { return; } - load_thrd_ = std::thread([this] { - while (!is_closed_.load() && LoadBatch()) { - } + load_thrd_ = std::thread([this, queue] { + while (!is_closed_.load()) LoadBatch(queue); }); } @@ -159,16 +160,17 @@ class FileDataReader { return ret; } - LoDTensorArray Next() { - LoDTensorArray batch_data; - batch_buffer_.Pull(&batch_data); - return batch_data; - } - - bool LoadBatch() { + // LoDTensorArray Next() { + // LoDTensorArray batch_data; + // batch_buffer_.Pull(&batch_data); + // return batch_data; + // } + // + void LoadBatch(LoDTensorBlockingQueue* queue) { // std::cout << "start LoadBatch 0.01" << std::endl; LoDTensorArray batch_data = std::move(Read()); - return batch_buffer_.Push(batch_data) == BufferStatus::kBufferStatusSuccess; + queue->Push(batch_data); + // return batch_buffer_.Push(batch_data) == BufferStatus::kBufferStatusSuccess; } private: @@ -187,8 +189,9 @@ class FileDataReader { class FileDataReaderWrapper { public: - void SetUp(const framework::ExecutionContext& ctx) { - reader.reset(new FileDataReader(ctx)); + void SetUp(const framework::ExecutionContext& ctx, + LoDTensorBlockingQueue* queue) { + reader.reset(new FileDataReader(ctx, queue)); } std::shared_ptr reader = nullptr; @@ -196,22 +199,6 @@ class FileDataReaderWrapper { FileDataReaderWrapper reader_wrapper; -static void CheckAndInitQueue(framework::Variable* var, int capacity) { - if (var->IsInitialized()) { - PADDLE_ENFORCE_EQ(var->IsType(), true, - platform::errors::InvalidArgument( - "Variable should hold LoDTensorBlockingQueueHolder type")); - auto holder = var->Get(); - if (holder.GetQueue() == nullptr) { - holder.InitOnce(capacity); - } - } else { - LOG(ERROR) << "Initialize Output LoDTensorBlockingQueue capacity " << capacity; - auto* holder = var->GetMutable(); - holder->InitOnce(capacity); - } -} - template class CPUFileLabelKernel : public 
framework::OpKernel { public: @@ -248,27 +235,27 @@ class FileLabelReaderOp : public framework::OperatorBase { auto& dev_ctx = *pool.Get(dev_place); framework::RuntimeContext run_ctx(Inputs(), Outputs(), scope); framework::ExecutionContext ctx(*this, scope, dev_ctx, run_ctx); - if (reader_wrapper.reader == nullptr) { - // create reader - reader_wrapper.SetUp(ctx); - } - LoDTensorArray samples = reader_wrapper.reader->Next(); + auto* out = scope.FindVar(Output("Out")); - // auto* holder = out->template GetMutable(); auto out_queue = out->Get().GetQueue(); if (out_queue == nullptr) { - LOG(ERROR) << "init output queue"; + LOG(ERROR) << "FileLabelReaderOp init output queue"; auto* holder = out->template GetMutable(); holder->InitOnce(2); out_queue = holder->GetQueue(); } - framework::LoDTensorArray out_array; - out_array.resize(samples.size()); - for (size_t i = 0; i < samples.size(); ++i) { - copy_tensor(samples[i], &out_array[i]); + if (reader_wrapper.reader == nullptr) { + // create reader + reader_wrapper.SetUp(ctx, out_queue.get()); } - out_queue->Push(out_array); + // LoDTensorArray samples = reader_wrapper.reader->Next(); + // framework::LoDTensorArray out_array; + // out_array.resize(samples.size()); + // for (size_t i = 0; i < samples.size(); ++i) { + // copy_tensor(samples[i], &out_array[i]); + // } + // out_queue->Push(out_array); LOG(ERROR) << "FileLabelReaderOp RunImpl finish"; } diff --git a/paddle/fluid/operators/random_crop_and_resize_op.cc b/paddle/fluid/operators/random_crop_and_resize_op.cc index a0d7e79973a59a..9cb6c27d3d3e9d 100644 --- a/paddle/fluid/operators/random_crop_and_resize_op.cc +++ b/paddle/fluid/operators/random_crop_and_resize_op.cc @@ -48,8 +48,6 @@ class RandomCropAndResizeOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - // return framework::OpKernelType( - // OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); return 
framework::OpKernelType( framework::proto::VarType::UINT8, ctx.GetPlace()); } diff --git a/paddle/fluid/operators/random_crop_and_resize_op.cu b/paddle/fluid/operators/random_crop_and_resize_op.cu index c9bc1a3a6f6cae..9b4bb2fc30ca35 100644 --- a/paddle/fluid/operators/random_crop_and_resize_op.cu +++ b/paddle/fluid/operators/random_crop_and_resize_op.cu @@ -179,7 +179,6 @@ static void RandomCropAndResizeFwd( } else { dim_out = {out_h, out_w, c}; } - // auto output_data = output->template mutable_data(ctx.GetPlace()); auto output_data = output->data(); if (img_h == crop_h && img_w == crop_w) { @@ -279,30 +278,29 @@ class RandomCropAndResizeCUDAKernel : public framework::OpKernel { platform::is_gpu_place(ctx.GetPlace()), true, platform::errors::NotFound("This kernel only runs on GPU device.")); // get input, output - // auto& x = ctx.MultiInput("X"); - // auto* x = ctx.Input("X"); - // PADDLE_ENFORCE_GT(x->size(), 0, - // platform::errors::InvalidArgument( - // "The size of X must be greater than 0.")); - // auto* out = ctx.Output("Out"); - - auto* in_var = ctx.InputVar("X"); - auto in_queue = in_var->Get().GetQueue(); - - auto* out_var = ctx.OutputVar("Out"); - auto out_queue = out_var->Get().GetQueue(); - if (out_queue == nullptr) { - LOG(ERROR) << "RandomCropAndResize out_queue init"; - auto* holder = out_var->template GetMutable(); - holder->InitOnce(2); - out_queue = holder->GetQueue(); - } - - bool success = false; - auto x = in_queue->Pop(&success); - PADDLE_ENFORCE_EQ(success, true, - platform::errors::PreconditionNotMet("Read from input queue failed")); - framework::LoDTensor out; + auto* x = ctx.Input("X"); + PADDLE_ENFORCE_GT(x->size(), 0, + platform::errors::InvalidArgument( + "The size of X must be greater than 0.")); + auto* out = ctx.Output("Out"); + + // auto* in_var = ctx.InputVar("X"); + // auto in_queue = in_var->Get().GetQueue(); + // + // auto* out_var = ctx.OutputVar("Out"); + // auto out_queue = out_var->Get().GetQueue(); + // if (out_queue == 
nullptr) { + // LOG(ERROR) << "RandomCropAndResize out_queue init"; + // auto* holder = out_var->template GetMutable(); + // holder->InitOnce(2); + // out_queue = holder->GetQueue(); + // } + // + // bool success = false; + // auto x = in_queue->Pop(&success); + // PADDLE_ENFORCE_EQ(success, true, + // platform::errors::PreconditionNotMet("Read from input queue failed")); + // framework::LoDTensor out; // get size, scale, ratio auto size = ctx.Attr>("size"); @@ -319,18 +317,18 @@ class RandomCropAndResizeCUDAKernel : public framework::OpKernel { bool align_corners = ctx.Attr("align_corners"); int align_mode = ctx.Attr("align_mode"); - auto* img = &x.at(0); + auto* img = &x->at(0); int64_t img_c = data_layout == DataLayout::kNCHW ? \ img->dims()[0] : img->dims()[2]; - std::vector out_dim = {static_cast(x.size()), + std::vector out_dim = {static_cast(x->size()), img_c, size[0], size[1]}; - out.Resize(framework::make_ddim(out_dim)); - out.mutable_data(ctx.GetPlace()); + out->Resize(framework::make_ddim(out_dim)); + out->mutable_data(ctx.GetPlace()); int img_h, img_w, idx_h, idx_w, crop_h, crop_w; - for (int i = 0; i < x.size(); i++) { - img = &x.at(i); + for (int i = 0; i < x->size(); i++) { + img = &x->at(i); img_h = data_layout == DataLayout::kNCHW ? 
img->dims()[1] : img->dims()[0]; img_w = @@ -338,16 +336,16 @@ class RandomCropAndResizeCUDAKernel : public framework::OpKernel { GetCropParameters(img_h, img_w, scale, ratio, &idx_h, &idx_w, &crop_h, &crop_w, seed); - auto out_tensor = out.Slice(i, i + 1); + auto out_tensor = out->Slice(i, i + 1); RandomCropAndResizeFwd(ctx, *img, &out_tensor, size, interp_method, align_corners, align_mode, img_h, img_w, img_c, idx_h, idx_w, crop_h, crop_w, data_layout); } - framework::LoDTensorArray out_array; - out_array.reserve(1); - out_array.emplace_back(out); - out_queue->Push(out_array); + // framework::LoDTensorArray out_array; + // out_array.reserve(1); + // out_array.emplace_back(out); + // out_queue->Push(out_array); LOG(ERROR) << "RandomCropAndResizeCUDAKernel Compute finish"; } }; diff --git a/python/paddle/fluid/dataloader/ops.py b/python/paddle/fluid/dataloader/ops.py index f835be5a45309c..102b2eb6af8f05 100755 --- a/python/paddle/fluid/dataloader/ops.py +++ b/python/paddle/fluid/dataloader/ops.py @@ -59,7 +59,7 @@ def map(map_func, inputs): program_inputs = [ map_block.create_var( name=unique_name.generate("map_sub"), - type=core.VarDesc.VarType.LOD_TENSOR, + type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, persistable=False) for i in range(len(inputs))] program_outputs = map_func(*program_inputs) program_outputs = _to_list(program_outputs) @@ -78,6 +78,9 @@ def map(map_func, inputs): "input_var_names": input_var_names, "output_var_names": output_var_names } + print("attr: ", attrs) + import sys + sys.stdout.flush() helper.append_op( type="map", From 2e89ad33051d0852a59e236690c28ced6688d225 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Thu, 9 Dec 2021 12:27:45 +0000 Subject: [PATCH 27/95] fix map input type --- paddle/fluid/operators/data/batch_decode_op.cc | 4 ---- python/paddle/fluid/dataloader/ops.py | 8 ++++---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/operators/data/batch_decode_op.cc 
b/paddle/fluid/operators/data/batch_decode_op.cc index 6c78118a8e0faa..bacdaa543020bb 100644 --- a/paddle/fluid/operators/data/batch_decode_op.cc +++ b/paddle/fluid/operators/data/batch_decode_op.cc @@ -31,7 +31,6 @@ class CPUBatchDecodeJpegKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { // TODO(LieLinJiang): add cpu implement. - LOG(ERROR) << "CPUBatchDecodeJpegKernel enter"; PADDLE_THROW(platform::errors::Unimplemented( "DecodeJpeg op only supports GPU now.")); } @@ -65,8 +64,6 @@ class BatchDecodeJpegOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - // return framework::OpKernelType( - // OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); return framework::OpKernelType( framework::proto::VarType::UINT8, ctx.GetPlace()); } @@ -74,7 +71,6 @@ class BatchDecodeJpegOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, const framework::Tensor& tensor, const framework::OpKernelType& expected_kernel_type) const { - LOG(ERROR) << "GetKernelTypeForVar enter "; if (var_name == "X") { return expected_kernel_type; } diff --git a/python/paddle/fluid/dataloader/ops.py b/python/paddle/fluid/dataloader/ops.py index 102b2eb6af8f05..3b28ad0e08c040 100755 --- a/python/paddle/fluid/dataloader/ops.py +++ b/python/paddle/fluid/dataloader/ops.py @@ -59,8 +59,8 @@ def map(map_func, inputs): program_inputs = [ map_block.create_var( name=unique_name.generate("map_sub"), - type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, - persistable=False) for i in range(len(inputs))] + type=inputs[0].desc.type(), + persistable=False) for inp in inputs] program_outputs = map_func(*program_inputs) program_outputs = _to_list(program_outputs) @@ -70,8 +70,8 @@ def map(map_func, inputs): outputs = \ [helper.create_variable( name=unique_name.generate("map"), 
- type=core.VarDesc.VarType.LOD_TENSOR, - persistable=True) for _ in range(len(program_outputs))] + type=outp.desc.type(), + persistable=True) for outp in program_outputs] attrs = { "map_block": map_block, "program_id": program_id, From ff3a7cd7886ca407c75b70c372eb975a50fc4a59 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Wed, 15 Dec 2021 15:00:09 +0000 Subject: [PATCH 28/95] use new CUDADeviceContext in map op --- paddle/fluid/framework/operator.cc | 8 +++- paddle/fluid/platform/device_context.cc | 55 +++++++++++++++++++++++++ paddle/fluid/platform/device_context.h | 29 +++++++++++++ paddle/fluid/platform/init.cc | 1 + python/paddle/fluid/dataloader/ops.py | 22 ++++++++++ 5 files changed, 114 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index e75fb4e36336ab..c1fcc226e892e0 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1110,7 +1110,13 @@ void OperatorWithKernel::RunImpl(const Scope& scope, const platform::Place& place, RuntimeContext* runtime_ctx) const { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto* dev_ctx = pool.Get(place); + auto* dev_ctx = HasAttr("stream_id") ? 
+ platform::AsyncDeviceContextPool::Instance().Get( + place, Attr("stream_id")) : nullptr; + if (dev_ctx == nullptr) { + dev_ctx = pool.Get(place); + } + LOG(ERROR) << "Op RunImpl " << DebugString() << " dev_ctx " << dev_ctx; #ifdef PADDLE_WITH_ASCEND_CL // NOTE(wangxi): nan/inf cannot be detected on NPU by checking the variable diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index cc3aab3ecdb7c5..7b4fc7b463e44d 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -181,6 +181,61 @@ DeviceContextPool::DeviceContextPool( } } +AsyncDeviceContextPool* AsyncDeviceContextPool::pool = nullptr; + +platform::DeviceContext* AsyncDeviceContextPool::Get(const platform::Place& place, const int64_t stream_id) { + VLOG(6) << "AsyncDeviceContextPool Get: " << place << ", " << stream_id; + if (!platform::is_gpu_place(place)) return nullptr; + + auto place_it = device_contexts_.find(place); + if (place_it == device_contexts_.end()) { + PADDLE_THROW(platform::errors::Unimplemented( + "Place %s is not supported. 
Please check that your paddle compiles " + "with WITH_GPU, WITH_XPU or WITH_ASCEND_CL option or check that " + "your train process set the correct device id if you use Executor.", + place)); + } + + if (device_contexts_[place].count(stream_id) > 0) { + return device_contexts_[place][stream_id].get(); + } else { + auto* dev_ctx = new CUDADeviceContext(BOOST_GET_CONST(CUDAPlace, place)); + LOG(ERROR) << "craete dev_ctx " << dev_ctx << " with stream " << dev_ctx->stream(); + device_contexts_[place].emplace(stream_id, std::unique_ptr(dev_ctx)); + return dev_ctx; + } + // auto stream_map = place_it->second; + // auto stream_it = stream_map.find(stream_id); + // if (stream_it == stream_map.end()) { + // // auto dev_ctx = std::unique_ptr(new CUDADeviceContext(BOOST_GET_CONST(CUDAPlace, place))); + // // stream_map.emplace(stream_id, dev_ctx); + // // return dev_ctx.get(); + // } else { + // // return stream_it->second.get(); + // } + // return nullptr; +} + +AsyncDeviceContextPool::AsyncDeviceContextPool( + const std::vector& places) { + PADDLE_ENFORCE_GT( + places.size(), 0, + platform::errors::InvalidArgument("The number of platform places should " + "be larger than 0. But received %d.", + places.size())); + std::set set; + for (auto& p : places) { + set.insert(p); + } + for (auto& p : set) { + if (platform::is_gpu_place(p)) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + device_contexts_.emplace(p, std::map>()); +#endif + } + } +} + CPUDeviceContext::CPUDeviceContext() { eigen_device_.reset(new Eigen::DefaultDevice()); } diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 13a1040dd19df2..1189e006a5ada2 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -855,5 +855,34 @@ class DeviceContextPool { DISABLE_COPY_AND_ASSIGN(DeviceContextPool); }; +/*! 
\brief async device context pool singleton */ +class AsyncDeviceContextPool { + public: + explicit AsyncDeviceContextPool(const std::vector& places); + + static AsyncDeviceContextPool& Instance() { + PADDLE_ENFORCE_NOT_NULL(pool, + platform::errors::PreconditionNotMet( + "Need to Create DeviceContextPool firstly!")); + return *pool; + } + + /*! \brief Create should only called by Init function */ + static AsyncDeviceContextPool& Init(const std::vector& places) { + if (pool == nullptr) { + pool = new AsyncDeviceContextPool(places); + } + return *pool; + } + + /*! \brief Return handle of single device context. */ + platform::DeviceContext* Get(const platform::Place& place, const int64_t stream_id); + + private: + static AsyncDeviceContextPool* pool; + std::map>> device_contexts_; + DISABLE_COPY_AND_ASSIGN(AsyncDeviceContextPool); +}; + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index 290b3353ae54cc..7c78e4d6c00ff2 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -189,6 +189,7 @@ void InitDevices(const std::vector devices) { places.emplace_back(platform::CPUPlace()); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) places.emplace_back(platform::CUDAPinnedPlace()); + platform::AsyncDeviceContextPool::Init(places); #endif platform::DeviceContextPool::Init(places); diff --git a/python/paddle/fluid/dataloader/ops.py b/python/paddle/fluid/dataloader/ops.py index 3b28ad0e08c040..721739004029c1 100755 --- a/python/paddle/fluid/dataloader/ops.py +++ b/python/paddle/fluid/dataloader/ops.py @@ -44,6 +44,22 @@ def __exit__(self, exc_type, exc_val, exc_tb): return exc_type is None +class _StreamIDGenerator(object): + def __init__(self): + self.stream_id = 0 + + def get_stream_id(self): + self.stream_id += 1 + return self.stream_id - 1 + + +_stream_id_generator = _StreamIDGenerator() + + +def _generate_stream_id(): + return _stream_id_generator.get_stream_id() + 
+ def map(map_func, inputs): assert not in_dygraph_mode(), \ "paddle.io.map can only be used in static mode" @@ -79,6 +95,12 @@ def map(map_func, inputs): "output_var_names": output_var_names } print("attr: ", attrs) + + stream_id = _generate_stream_id() + for idx in range(map_block.desc.op_size()): + map_block.desc.op(idx)._set_attr('stream_id', stream_id) + print("map_block", map_block.desc.op(idx).attr_names()) + import sys sys.stdout.flush() From 18cd9075c826e4796950f24026aa1b994290d038 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Wed, 15 Dec 2021 15:15:35 +0000 Subject: [PATCH 29/95] simplify log --- paddle/fluid/framework/operator.cc | 1 - paddle/fluid/operators/data/map_op.cc | 4 ++-- paddle/fluid/platform/device_context.cc | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index c1fcc226e892e0..75b1ba8fd87d53 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1116,7 +1116,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope, if (dev_ctx == nullptr) { dev_ctx = pool.Get(place); } - LOG(ERROR) << "Op RunImpl " << DebugString() << " dev_ctx " << dev_ctx; #ifdef PADDLE_WITH_ASCEND_CL // NOTE(wangxi): nan/inf cannot be detected on NPU by checking the variable diff --git a/paddle/fluid/operators/data/map_op.cc b/paddle/fluid/operators/data/map_op.cc index 622dbbac852615..41b80a38fd0858 100644 --- a/paddle/fluid/operators/data/map_op.cc +++ b/paddle/fluid/operators/data/map_op.cc @@ -42,7 +42,7 @@ class MapOp : public framework::OperatorBase { private: void RunImpl(const framework::Scope& scope, const platform::Place& dev_place) const override { - LOG(ERROR) << "MapOpKernel RunImpl enter"; + // LOG(ERROR) << "MapOpKernel RunImpl enter"; // Step1: get output vars and attrs // FIXME(dkp): multi input support auto input_var = scope.FindVar(Input("In")); @@ -68,7 +68,7 @@ class MapOp : public framework::OperatorBase { map_block, 
program_id, &scope, dev_place, input_var_names, output_var_names, input_queues, output_queues); - LOG(ERROR) << "MapOpKernel RunImpl finish"; + // LOG(ERROR) << "MapOpKernel RunImpl finish"; } }; diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 7b4fc7b463e44d..30ebe88acfe9d6 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -200,7 +200,7 @@ platform::DeviceContext* AsyncDeviceContextPool::Get(const platform::Place& plac return device_contexts_[place][stream_id].get(); } else { auto* dev_ctx = new CUDADeviceContext(BOOST_GET_CONST(CUDAPlace, place)); - LOG(ERROR) << "craete dev_ctx " << dev_ctx << " with stream " << dev_ctx->stream(); + // LOG(ERROR) << "craete dev_ctx " << dev_ctx << " with stream " << dev_ctx->stream(); device_contexts_[place].emplace(stream_id, std::unique_ptr(dev_ctx)); return dev_ctx; } From 8c74403f737138b07faeabcabf1fe1c116c02217 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 21 Dec 2021 09:04:54 +0000 Subject: [PATCH 30/95] random flip success --- paddle/fluid/operators/flip_op.cc | 1 + paddle/fluid/operators/flip_op.cu | 1 + python/paddle/fluid/dataloader/ops.py | 7 ++++--- python/paddle/vision/ops.py | 15 +++++++++++++++ 4 files changed, 21 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/flip_op.cc b/paddle/fluid/operators/flip_op.cc index 5e6d263f1907b5..a08a0ca142053a 100644 --- a/paddle/fluid/operators/flip_op.cc +++ b/paddle/fluid/operators/flip_op.cc @@ -153,6 +153,7 @@ REGISTER_OPERATOR(flip, ops::FlipOp, ops::FlipOpMaker, ops::FlipOpInferVarType, REGISTER_OP_CPU_KERNEL( flip, ops::FlipKernel, ops::FlipKernel, + ops::FlipKernel, ops::FlipKernel, ops::FlipKernel, ops::FlipKernel, diff --git a/paddle/fluid/operators/flip_op.cu b/paddle/fluid/operators/flip_op.cu index 26b3d11bc6c7b7..bf5615b4a32dd3 100644 --- a/paddle/fluid/operators/flip_op.cu +++ b/paddle/fluid/operators/flip_op.cu @@ -162,6 +162,7 @@ 
REGISTER_OP_CUDA_KERNEL( flip, ops::FlipKernel, ops::FlipKernel, ops::FlipKernel, + ops::FlipKernel, ops::FlipKernel, ops::FlipKernel, ops::FlipKernel, diff --git a/python/paddle/fluid/dataloader/ops.py b/python/paddle/fluid/dataloader/ops.py index 721739004029c1..91383f04aeaed2 100755 --- a/python/paddle/fluid/dataloader/ops.py +++ b/python/paddle/fluid/dataloader/ops.py @@ -88,18 +88,19 @@ def map(map_func, inputs): name=unique_name.generate("map"), type=outp.desc.type(), persistable=True) for outp in program_outputs] + stream_id = _generate_stream_id() attrs = { "map_block": map_block, "program_id": program_id, "input_var_names": input_var_names, "output_var_names": output_var_names } - print("attr: ", attrs) + print(stream_id, "attr: ", attrs) - stream_id = _generate_stream_id() + # stream_id = _generate_stream_id() for idx in range(map_block.desc.op_size()): map_block.desc.op(idx)._set_attr('stream_id', stream_id) - print("map_block", map_block.desc.op(idx).attr_names()) + print("map_block", stream_id, map_block.desc.op(idx).type()) import sys sys.stdout.flush() diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index 3c03418a0539ca..3f6332e9f35286 100644 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -20,6 +20,7 @@ from ..nn import Layer from ..fluid.initializer import Normal +import paddle from paddle.common_ops_import import * from paddle import _C_ops @@ -30,6 +31,7 @@ 'DeformConv2D', 'read_file', 'decode_jpeg', + 'random_flip', 'roi_pool', 'RoIPool', 'psroi_pool', @@ -969,6 +971,19 @@ def image_decode(x, mode='unchanged', num_threads=2, name=None): return out +def random_flip(x, batch_size, prob=0.5, name=None): + if prob < 0. or prob > 1.: + raise ValueError("prob should in (0, 1) in random_flip") + + p = paddle.uniform([batch_size, 1], min=0., max=1.) 
+ ie = layers.IfElse(p < prob) + with ie.true_block(): + out = ie.input(x) + out = paddle.flip(x, -1) + ie.output(out) + return ie()[0] + + def decode_jpeg(x, mode='unchanged', name=None): """ Decodes a JPEG image into a 3 dimensional RGB Tensor or 1 dimensional Gray Tensor. From 6e4b45f34e7e48a35ae000fa3a439a6285856f47 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 21 Dec 2021 12:57:40 +0000 Subject: [PATCH 31/95] polish log --- paddle/fluid/operators/data/map_op.h | 2 +- python/paddle/fluid/dataloader/ops.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/data/map_op.h b/paddle/fluid/operators/data/map_op.h index b52c060c223bd9..2431b9e6a0932c 100644 --- a/paddle/fluid/operators/data/map_op.h +++ b/paddle/fluid/operators/data/map_op.h @@ -44,9 +44,9 @@ static void CheckAndInitOutputQueue(const std::vector& vars, int capa "LoDTensorBlockingQueueHolder type")); auto queue = var->Get().GetQueue(); if (queue == nullptr) { - LOG(ERROR) << "MapOpKernel init queue"; auto* holder = var->template GetMutable(); holder->InitOnce(capacity); + LOG(ERROR) << "MapOpKernel init queue" << holder->GetQueue(); } } else { VLOG(1) << "Initialize Output LoDTensorBlockingQueue capacity " << capacity; diff --git a/python/paddle/fluid/dataloader/ops.py b/python/paddle/fluid/dataloader/ops.py index 91383f04aeaed2..357035e23618e1 100755 --- a/python/paddle/fluid/dataloader/ops.py +++ b/python/paddle/fluid/dataloader/ops.py @@ -95,12 +95,11 @@ def map(map_func, inputs): "input_var_names": input_var_names, "output_var_names": output_var_names } - print(stream_id, "attr: ", attrs) + print("stream id: ", stream_id, "attr: ", attrs) # stream_id = _generate_stream_id() for idx in range(map_block.desc.op_size()): map_block.desc.op(idx)._set_attr('stream_id', stream_id) - print("map_block", stream_id, map_block.desc.op(idx).type()) import sys sys.stdout.flush() From a06c26d1184ff446a6c9b8a1aa448a9e8db009e1 Mon Sep 17 00:00:00 2001 From: 
dengkaipeng Date: Thu, 30 Dec 2021 03:11:49 +0000 Subject: [PATCH 32/95] add SetROI in nvjpeg decoder --- paddle/fluid/operators/data/CMakeLists.txt | 5 +- .../fluid/operators/data/batch_decode_op.cc | 126 ---------- .../data/batch_decode_random_crop_op.cc | 151 ++++++++++++ ...e_op.cu => batch_decode_random_crop_op.cu} | 28 ++- .../data/batch_decode_random_crop_op.h | 44 ++++ paddle/fluid/operators/data/map_runner.cc | 35 ++- paddle/fluid/operators/data/map_runner.h | 22 +- paddle/fluid/operators/data/nvjpeg_decoder.cc | 30 ++- paddle/fluid/operators/data/nvjpeg_decoder.h | 14 +- paddle/fluid/operators/data/pipeline.cc | 6 +- paddle/fluid/operators/data/pipeline.h | 13 +- .../operators/data/random_roi_generator.cc | 104 +++++++++ .../operators/data/random_roi_generator.h | 60 +++++ paddle/fluid/operators/data/shutdown.h | 47 ++++ .../operators/data/unity_build_rule.cmake | 5 +- .../fluid/operators/file_label_reader_op.cc | 182 +-------------- paddle/fluid/operators/file_label_reader_op.h | 219 ++++++++++++++++++ paddle/fluid/platform/dynload/nvjpeg.h | 1 + paddle/fluid/pybind/pybind.cc | 4 + python/paddle/fluid/core.py | 2 + python/paddle/fluid/dataloader/pipeline.py | 11 +- python/paddle/vision/ops.py | 40 +++- 22 files changed, 783 insertions(+), 366 deletions(-) delete mode 100644 paddle/fluid/operators/data/batch_decode_op.cc create mode 100644 paddle/fluid/operators/data/batch_decode_random_crop_op.cc rename paddle/fluid/operators/data/{batch_decode_op.cu => batch_decode_random_crop_op.cu} (78%) create mode 100644 paddle/fluid/operators/data/batch_decode_random_crop_op.h create mode 100644 paddle/fluid/operators/data/random_roi_generator.cc create mode 100644 paddle/fluid/operators/data/random_roi_generator.h create mode 100644 paddle/fluid/operators/data/shutdown.h create mode 100644 paddle/fluid/operators/file_label_reader_op.h diff --git a/paddle/fluid/operators/data/CMakeLists.txt b/paddle/fluid/operators/data/CMakeLists.txt index 
906d280a77b059..6c0806249d8345 100644 --- a/paddle/fluid/operators/data/CMakeLists.txt +++ b/paddle/fluid/operators/data/CMakeLists.txt @@ -10,8 +10,9 @@ op_library(dataloader_op SRCS dataloader_op.cc dataloader_op.cu.cc DEPS pipeline cc_library(map_runner SRCS map_runner.cc DEPS parallel_executor simple_threadpool scope) op_library(map_op SRCS map_op.cc map_op.cu.cc DEPS map_runner ${OP_HEADER_DEPS}) -cc_library(nvjpeg_decoder SRCS nvjpeg_decoder.cc DEPS ${OP_HEADER_DEPS}) -op_library(batch_decode_op SRCS batch_decode_op.cc batch_decode_op.cu DEPS nvjpeg_decoder ${OP_HEADER_DEPS}) +cc_library(random_roi_generator SRCS random_roi_generator.cc DEPS ${OP_HEADER_DEPS}) +cc_library(nvjpeg_decoder SRCS nvjpeg_decoder.cc DEPS random_roi_generator ${OP_HEADER_DEPS}) +op_library(batch_decode_random_crop_op SRCS batch_decode_random_crop_op.cc batch_decode_random_crop_op.cu DEPS nvjpeg_decoder ${OP_HEADER_DEPS}) # register_operators() diff --git a/paddle/fluid/operators/data/batch_decode_op.cc b/paddle/fluid/operators/data/batch_decode_op.cc deleted file mode 100644 index bacdaa543020bb..00000000000000 --- a/paddle/fluid/operators/data/batch_decode_op.cc +++ /dev/null @@ -1,126 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include -#include - -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace operators { -namespace data { - -template -class CPUBatchDecodeJpegKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - // TODO(LieLinJiang): add cpu implement. - PADDLE_THROW(platform::errors::Unimplemented( - "DecodeJpeg op only supports GPU now.")); - } -}; - -class BatchDecodeJpegOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "DecodeJpeg"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "DecodeJpeg"); - - auto mode = ctx->Attrs().Get("mode"); - std::vector out_dims; - - if (mode == "unchanged") { - out_dims = {-1, -1, -1}; - } else if (mode == "gray") { - out_dims = {1, -1, -1}; - } else if (mode == "rgb") { - out_dims = {3, -1, -1}; - } else { - PADDLE_THROW(platform::errors::Fatal( - "The provided mode is not supported for JPEG files on GPU: ", mode)); - } - - // ctx->SetOutputDim("Out", framework::make_ddim(out_dims)); - } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::proto::VarType::UINT8, ctx.GetPlace()); - } - - framework::OpKernelType GetKernelTypeForVar( - const std::string& var_name, const framework::Tensor& tensor, - const framework::OpKernelType& expected_kernel_type) const { - if (var_name == "X") { - return expected_kernel_type; - } - - return framework::OpKernelType(tensor.type(), tensor.place(), - tensor.layout()); - } -}; - -class BatchDecodeJpegInferVarType : public 
framework::VarTypeInference { - public: - void operator()(framework::InferVarTypeContext* ctx) const override { - ctx->SetOutputType("Out", framework::proto::VarType::LOD_TENSOR_ARRAY, - framework::ALL_ELEMENTS); - } -}; - -class BatchDecodeJpegOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "A one dimensional uint8 tensor containing the raw bytes " - "of the JPEG image. It is a tensor with rank 1."); - AddOutput("Out", "The output tensor of DecodeJpeg op"); - AddComment(R"DOC( -This operator decodes a JPEG image into a 3 dimensional RGB Tensor -or 1 dimensional Gray Tensor. Optionally converts the image to the -desired format. The values of the output tensor are uint8 between 0 -and 255. -)DOC"); - AddAttr("num_threads", "Path of the file to be readed.") - .SetDefault(2); - AddAttr( - "mode", - "(string, default \"unchanged\"), The read mode used " - "for optionally converting the image, can be \"unchanged\" " - ",\"gray\" , \"rgb\" .") - .SetDefault("unchanged"); - } -}; - -} // namespace data -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OPERATOR( - batch_decode, ops::data::BatchDecodeJpegOp, ops::data::BatchDecodeJpegOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker) - -REGISTER_OP_CPU_KERNEL(batch_decode, ops::data::CPUBatchDecodeJpegKernel) diff --git a/paddle/fluid/operators/data/batch_decode_random_crop_op.cc b/paddle/fluid/operators/data/batch_decode_random_crop_op.cc new file mode 100644 index 00000000000000..27677e5c4bad08 --- /dev/null +++ b/paddle/fluid/operators/data/batch_decode_random_crop_op.cc @@ -0,0 +1,151 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/data/batch_decode_random_crop_op.h" + +namespace paddle { +namespace operators { +namespace data { + +class BatchDecodeRandomCropOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "DecodeJpeg"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "DecodeJpeg"); + + auto aspect_ratio_min = ctx->Attrs().Get("aspect_ratio_min"); + auto aspect_ratio_max = ctx->Attrs().Get("aspect_ratio_max"); + PADDLE_ENFORCE_GT(aspect_ratio_min, 0., + platform::errors::InvalidArgument( + "aspect_ratio_min should be greater than 0, but received " + "%f", aspect_ratio_min)); + PADDLE_ENFORCE_GT(aspect_ratio_max, 0., + platform::errors::InvalidArgument( + "aspect_ratio_max should be greater than 0, but received " + "%f", aspect_ratio_max)); + PADDLE_ENFORCE_GE(aspect_ratio_max, aspect_ratio_min, + platform::errors::InvalidArgument( + "aspect_ratio_max should be greater than aspect_ratio_min, " + "but received aspect_ratio_max(%d) < aspect_ratio_min(%d)", + aspect_ratio_max, aspect_ratio_min)); + + auto area_min = ctx->Attrs().Get("area_min"); + auto area_max = ctx->Attrs().Get("area_max"); + PADDLE_ENFORCE_GT(area_min, 0., + platform::errors::InvalidArgument( + "area_minshould be greater than 0, but received " + "%f", area_min)); + PADDLE_ENFORCE_GT(area_max, 0., + platform::errors::InvalidArgument( + "area_max should be greater than 0, but received 
" + "%f", area_max)); + PADDLE_ENFORCE_GE(area_max, area_min, + platform::errors::InvalidArgument( + "area_max should be greater than area_min, " + "but received area_max(%f) < area_min(%f)", + area_max, area_min)); + + auto num_attempts= ctx->Attrs().Get("num_attempts"); + PADDLE_ENFORCE_GT(num_attempts, 0, + platform::errors::InvalidArgument( + "num_attempts should be a positive integerm, but " + "received %d", num_attempts)); + + // auto mode = ctx->Attrs().Get("mode"); + // std::vector out_dims; + // + // if (mode == "unchanged") { + // out_dims = {-1, -1, -1}; + // } else if (mode == "gray") { + // out_dims = {1, -1, -1}; + // } else if (mode == "rgb") { + // out_dims = {3, -1, -1}; + // } else { + // PADDLE_THROW(platform::errors::Fatal( + // "The provided mode is not supported for JPEG files on GPU: ", mode)); + // } + // + // ctx->SetOutputDim("Out", framework::make_ddim(out_dims)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::proto::VarType::UINT8, ctx.GetPlace()); + } + + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const framework::Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const { + if (var_name == "X") { + return expected_kernel_type; + } + + return framework::OpKernelType(tensor.type(), tensor.place(), + tensor.layout()); + } +}; + +class BatchDecodeRandomCropInferVarType : public framework::VarTypeInference { + public: + void operator()(framework::InferVarTypeContext* ctx) const override { + ctx->SetOutputType("Out", framework::proto::VarType::LOD_TENSOR_ARRAY, + framework::ALL_ELEMENTS); + } +}; + +class BatchDecodeRandomCropOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "A one dimensional uint8 tensor containing the raw bytes " + "of the JPEG image. 
It is a tensor with rank 1."); + AddOutput("Out", "The output tensor of DecodeJpeg op"); + AddComment(R"DOC( +This operator decodes a JPEG image into a 3 dimensional RGB Tensor +or 1 dimensional Gray Tensor. Optionally converts the image to the +desired format. The values of the output tensor are uint8 between 0 +and 255. +)DOC"); + AddAttr("num_threads", "Path of the file to be readed.") + .SetDefault(2); + AddAttr( + "mode", + "(string, default \"unchanged\"), The read mode used " + "for optionally converting the image, can be \"unchanged\" " + ",\"gray\" , \"rgb\" .") + .SetDefault("unchanged"); + AddAttr("aspect_ratio_min", "").SetDefault(3./4.); + AddAttr("aspect_ratio_max", "").SetDefault(4./3.); + AddAttr("area_min", "").SetDefault(0.08); + AddAttr("area_max", "").SetDefault(1.); + AddAttr("num_attempts", "").SetDefault(10); + } +}; + +} // namespace data +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR( + batch_decode_random_crop, ops::data::BatchDecodeRandomCropOp, ops::data::BatchDecodeRandomCropOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker) + +REGISTER_OP_CPU_KERNEL(batch_decode_random_crop, ops::data::CPUBatchDecodeRandomCropKernel) diff --git a/paddle/fluid/operators/data/batch_decode_op.cu b/paddle/fluid/operators/data/batch_decode_random_crop_op.cu similarity index 78% rename from paddle/fluid/operators/data/batch_decode_op.cu rename to paddle/fluid/operators/data/batch_decode_random_crop_op.cu index b25b8b383f3ef8..e8fe313d6c4883 100644 --- a/paddle/fluid/operators/data/batch_decode_op.cu +++ b/paddle/fluid/operators/data/batch_decode_random_crop_op.cu @@ -14,11 +14,7 @@ #if !defined(WITH_NV_JETSON) && !defined(PADDLE_WITH_HIP) -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/operators/data/nvjpeg_decoder.h" +#include 
"paddle/fluid/operators/data/batch_decode_random_crop_op.h" #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" namespace paddle { @@ -27,10 +23,11 @@ namespace data { using LoDTensorBlockingQueueHolder = operators::reader::LoDTensorBlockingQueueHolder; -static NvjpegDecoderThreadPool* decode_pool = nullptr; +NvjpegDecoderThreadPool* decode_pool = nullptr; +// std::seed_seq* rand_seq = nullptr; template -class GPUBatchDecodeJpegKernel : public framework::OpKernel { +class GPUBatchDecodeRandomCropKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { int num_threads = ctx.Attr("num_threads"); @@ -41,6 +38,7 @@ class GPUBatchDecodeJpegKernel : public framework::OpKernel { if (!decode_pool) { LOG(ERROR) << "GPUBatchDecodeJpegKernel decode_pool init"; decode_pool = new NvjpegDecoderThreadPool(num_threads, mode); + // rand_seq = new std::seed_seq(static_cast(time(0))); } const framework::LoDTensorArray* inputs = @@ -50,6 +48,18 @@ class GPUBatchDecodeJpegKernel : public framework::OpKernel { auto& out_array = *out->GetMutable(); out_array.resize(inputs->size()); + auto aspect_ratio_min = ctx.Attr("aspect_ratio_min"); + auto aspect_ratio_max = ctx.Attr("aspect_ratio_max"); + AspectRatioRange aspect_ratio_range{aspect_ratio_min, aspect_ratio_max}; + + auto area_min = ctx.Attr("area_min"); + auto area_max = ctx.Attr("area_max"); + AreaRange area_range{area_min, area_max}; + + std::seed_seq rand_seq{static_cast(time(0))}; + std::vector rands(inputs->size()); + rand_seq.generate(rands.begin(), rands.end()); + // auto* in_var = ctx.InputVar("X"); // auto in_queue = in_var->Get().GetQueue(); // @@ -78,6 +88,8 @@ class GPUBatchDecodeJpegKernel : public framework::OpKernel { .bit_stream = x_data, .bit_len = x_numel, .tensor = &out_array[i], + .roi_generator = new RandomROIGenerator( + aspect_ratio_range, area_range, rands[i]), .place = ctx.GetPlace() }; decode_pool->AddTask(std::make_shared(task)); @@ 
-115,6 +127,6 @@ class GPUBatchDecodeJpegKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(batch_decode, ops::data::GPUBatchDecodeJpegKernel) +REGISTER_OP_CUDA_KERNEL(batch_decode_random_crop, ops::data::GPUBatchDecodeRandomCropKernel) #endif diff --git a/paddle/fluid/operators/data/batch_decode_random_crop_op.h b/paddle/fluid/operators/data/batch_decode_random_crop_op.h new file mode 100644 index 00000000000000..592adf1c563da7 --- /dev/null +++ b/paddle/fluid/operators/data/batch_decode_random_crop_op.h @@ -0,0 +1,44 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include + +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/operators/data/nvjpeg_decoder.h" + + +namespace paddle { +namespace operators { +namespace data { + +template +class CPUBatchDecodeRandomCropKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + // TODO(LieLinJiang): add cpu implement. 
+ PADDLE_THROW(platform::errors::Unimplemented( + "DecodeJpeg op only supports GPU now.")); + } +}; + +} // namespace data +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/data/map_runner.cc b/paddle/fluid/operators/data/map_runner.cc index c4f4c3f53bd019..5c687eecfec79b 100644 --- a/paddle/fluid/operators/data/map_runner.cc +++ b/paddle/fluid/operators/data/map_runner.cc @@ -108,19 +108,22 @@ bool MapRunner::ShareInputsIntoScope(Scope* scope) { } void MapRunner::StartMapThread(const Scope* scope) { - thread_pool_.enqueue([this, scope]() -> void { + results_.emplace_back(thread_pool_.enqueue([this, scope]() -> bool { auto& scope_ = scope->NewScope(); framework::Executor executor(place_); while (running_.load()) { // Step 1: get input LoDTensor and share into Scope + // LOG(ERROR) << "MapThread Loop " << program_id_ << " start"; bool success = ShareInputsIntoScope(&scope_); if (!success) { - Shutdown(); - break; + ShutDown(); + return false; } + LOG(ERROR) << "MapThread Loop " << program_id_ << " ShareInputsIntoScope finish"; // Step 2: run ops by executor without fetch executor.Run(*map_block_->Program(), &scope_, map_block_->ID(), false, true, std::vector(), false, true); + LOG(ERROR) << "MapThread Loop " << program_id_ << " program run finish"; // Step 3: fetch output variable to LoDTensor vector // and push to output queue @@ -146,8 +149,12 @@ void MapRunner::StartMapThread(const Scope* scope) { output_queues_[i]->Push(t_arr); } } + LOG(ERROR) << "MapThread Loop " << program_id_ << " push queue finish"; } - }); + scope->DeleteScope(&scope_); + LOG(ERROR) << "MapThread Loop " << program_id_ << " delete scope and return"; + return true; + })); } void MapRunner::CheckOutputVarStatus(const Variable &var, @@ -174,18 +181,28 @@ void MapRunner::CheckOutputVarStatus(const Variable &var, } } -void MapRunner::Shutdown() { - VLOG(1) << "MapRunner shutdown"; +void MapRunner::ShutDown() { + VLOG(1) << "MapRunner shutdown " << program_id_; 
// close all output queue, op after this op can shutdown itself + LOG(ERROR) << "MapRunner ShutDown"; for (auto queue : output_queues_) { queue->Close(); } + LOG(ERROR) << "MapRunner ShutDown queue closed " << program_id_; - // set running_ as false to exit map thread, then release thread pool running_.store(false); + LOG(ERROR) << "MapRunner ShutDown running false" << program_id_; + + for (auto&& result: results_) { + LOG(ERROR) << "MapRunner get result " << program_id_; + result.get(); + LOG(ERROR) << "MapRunner get result finish" << program_id_; + } - // FIXME: ThreadPool doesn't have shutdown method - delete &thread_pool_; + // set running_ as false to exit map thread, then release thread pool + // // FIXME: ThreadPool doesn't have shutdown method + // delete &thread_pool_; + // LOG(ERROR) << "MapRunner ShutDown thread_pool_ deleted"; } // initialization static variables out of MapRunnerManager diff --git a/paddle/fluid/operators/data/map_runner.h b/paddle/fluid/operators/data/map_runner.h index b0aec606fdc8a2..ab3ce8ea7e8a0e 100644 --- a/paddle/fluid/operators/data/map_runner.h +++ b/paddle/fluid/operators/data/map_runner.h @@ -18,7 +18,6 @@ #include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" -#include "paddle/fluid/operators/data/data_scope.h" namespace paddle { namespace operators { @@ -41,9 +40,6 @@ class MapRunner { const int64_t program_id, const Scope* scope, const platform::Place &place, - // int64_t start_op_index, - // int64_t end_op_index, - // int64_t program_id, const std::vector &input_var_names, const std::vector &output_var_names, const std::vector> input_queues, @@ -54,7 +50,7 @@ class MapRunner { // Close(); // } - void Shutdown(); + void ShutDown(); inline bool IsRunning() { return running_; } @@ -76,6 +72,7 @@ class MapRunner { void CheckOutputVarStatus(const Variable &var, const std::string &var_name); ThreadPool thread_pool_; + std::vector> results_; std::atomic 
running_; std::shared_ptr map_block_; @@ -125,19 +122,28 @@ class MapRunnerManager { } } - void ShutdownMapRunner(int program_id) { + void ShutDownMapRunner(int program_id) { auto iter = prog_id_to_runner_.find(program_id); if (iter != prog_id_to_runner_.end()) { - iter->second.get()->Shutdown(); + iter->second.get()->ShutDown(); prog_id_to_runner_.erase(iter); } } + void ShutDown() { + auto iter = prog_id_to_runner_.begin(); + for (; iter != prog_id_to_runner_.end(); iter++) { + iter->second.get()->ShutDown(); + LOG(ERROR) << "MapRunnerManager prog_id " << iter->first << " shutdown finish"; + } + prog_id_to_runner_.clear(); + } + MapRunnerManager() { VLOG(1) << "MapRunnerManager init"; } ~MapRunnerManager() { VLOG(1) << "~MapRunnerManager"; - prog_id_to_runner_.clear(); + ShutDown(); } }; diff --git a/paddle/fluid/operators/data/nvjpeg_decoder.cc b/paddle/fluid/operators/data/nvjpeg_decoder.cc index 6b8c0844cfefc2..4becec40416ddb 100644 --- a/paddle/fluid/operators/data/nvjpeg_decoder.cc +++ b/paddle/fluid/operators/data/nvjpeg_decoder.cc @@ -73,9 +73,10 @@ NvjpegDecoder::~NvjpegDecoder() { PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(cuda_stream_)); } -void NvjpegDecoder::ParseOutputInfo( +void NvjpegDecoder::ParseDecodeParams( const uint8_t* bit_stream, size_t bit_len, framework::LoDTensor* out, - nvjpegImage_t* out_image, platform::Place place) { + RandomROIGenerator* roi_generator, nvjpegImage_t* out_image, + platform::Place place) { int components; nvjpegChromaSubsampling_t subsampling; int widths[NVJPEG_MAX_COMPONENT]; @@ -85,8 +86,8 @@ void NvjpegDecoder::ParseOutputInfo( platform::dynload::nvjpegGetImageInfo(handle_, bit_stream, bit_len, &components, &subsampling, widths, heights)); - int width = widths[0]; - int height = heights[0]; + int64_t width = static_cast(widths[0]); + int64_t height = static_cast(heights[0]); nvjpegOutputFormat_t output_format; int output_components; @@ -115,7 +116,12 @@ void NvjpegDecoder::ParseOutputInfo( 
PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecodeParamsSetOutputFormat(decode_params_, output_format)); - std::vector out_shape = {output_components, height, width}; + ROI roi; + roi_generator->GenerateRandomROI(width, height, &roi); + + PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecodeParamsSetROI(decode_params_, roi.x, roi.y, roi.w, roi.h)); + + std::vector out_shape = {output_components, roi.h, roi.w}; out->Resize(framework::make_ddim(out_shape)); // allocate memory and assign to out_image @@ -144,10 +150,11 @@ void NvjpegDecoder::Decode(const uint8_t* bit_stream, size_t bit_len, nvjpegImag PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(cuda_stream_)); } -void NvjpegDecoder::Run(const uint8_t* bit_stream, size_t bit_len, - framework::LoDTensor* out, platform::Place& place) { +void NvjpegDecoder::Run( + const uint8_t* bit_stream, size_t bit_len, framework::LoDTensor* out, + RandomROIGenerator* roi_generator, platform::Place& place) { nvjpegImage_t image; - ParseOutputInfo(bit_stream, bit_len, out, &image, place); + ParseDecodeParams(bit_stream, bit_len, out, roi_generator, &image, place); Decode(bit_stream, bit_len, &image); } @@ -167,7 +174,7 @@ NvjpegDecoderThreadPool::NvjpegDecoderThreadPool(const int num_threads, const st } } -NvjpegDecoderThreadPool::~NvjpegDecoderThreadPool() { Shutdown(); } +NvjpegDecoderThreadPool::~NvjpegDecoderThreadPool() { ShutDown(); } void NvjpegDecoderThreadPool::AddTask(std::shared_ptr task) { task_queue_.push_back(task); @@ -193,7 +200,7 @@ void NvjpegDecoderThreadPool::WaitTillTasksCompleted() { running_ = false; } -void NvjpegDecoderThreadPool::Shutdown() { +void NvjpegDecoderThreadPool::ShutDown() { std::lock_guard lock(mutex_); running_ = false; @@ -229,7 +236,8 @@ void NvjpegDecoderThreadPool::ThreadLoop(const int thread_idx) { outstand_tasks_++; lock.unlock(); - decoder->Run(task->bit_stream, task->bit_len, task->tensor, task->place); + decoder->Run(task->bit_stream, task->bit_len, task->tensor, + 
task->roi_generator, task->place); lock.lock(); outstand_tasks_--; diff --git a/paddle/fluid/operators/data/nvjpeg_decoder.h b/paddle/fluid/operators/data/nvjpeg_decoder.h index 3467a70b6082be..ebcf1762ade55f 100644 --- a/paddle/fluid/operators/data/nvjpeg_decoder.h +++ b/paddle/fluid/operators/data/nvjpeg_decoder.h @@ -21,6 +21,8 @@ limitations under the License. */ #include "paddle/fluid/platform/dynload/nvjpeg.h" #include "paddle/fluid/platform/stream/cuda_stream.h" +#include "paddle/fluid/operators/data/random_roi_generator.h" + namespace paddle { namespace operators { namespace data { @@ -35,6 +37,7 @@ struct NvjpegDecodeTask { const uint8_t* bit_stream; size_t bit_len; framework::LoDTensor* tensor; + RandomROIGenerator* roi_generator; platform::Place place; }; @@ -44,15 +47,16 @@ class NvjpegDecoder { ~NvjpegDecoder(); - void Run(const uint8_t* bit_stream, size_t bit_len, - framework::LoDTensor* out, platform::Place& place); + void Run(const uint8_t* bit_stream, size_t bit_len, framework::LoDTensor* out, + RandomROIGenerator* roi_generator, platform::Place& place); private: DISABLE_COPY_AND_ASSIGN(NvjpegDecoder); - void ParseOutputInfo( + void ParseDecodeParams( const uint8_t* bit_stream, size_t bit_len, framework::LoDTensor* out, - nvjpegImage_t* out_image, platform::Place place); + RandomROIGenerator* roi_generator, nvjpegImage_t* out_image, + platform::Place place); void Decode(const uint8_t* bit_stream, size_t bit_len, nvjpegImage_t* out_image); @@ -87,7 +91,7 @@ class NvjpegDecoderThreadPool { void WaitTillTasksCompleted(); - void Shutdown(); + void ShutDown(); private: DISABLE_COPY_AND_ASSIGN(NvjpegDecoderThreadPool); diff --git a/paddle/fluid/operators/data/pipeline.cc b/paddle/fluid/operators/data/pipeline.cc index 82fa06df714531..17cc9d1e121196 100644 --- a/paddle/fluid/operators/data/pipeline.cc +++ b/paddle/fluid/operators/data/pipeline.cc @@ -138,13 +138,13 @@ void Pipeline::ReadNext(std::vector &out_vars) { } } -inline void Pipeline::Close() { 
+void Pipeline::ShutDown() { VLOG(1) << "Pipeline close"; + closed_.store(true); prefetch_queue_.Close(); - closed_ = true; } -inline void Pipeline::Reset() { +void Pipeline::Reset() { // (TODO)Step1: reset dataset // // Step2: reopen pipeline diff --git a/paddle/fluid/operators/data/pipeline.h b/paddle/fluid/operators/data/pipeline.h index bc2776f5675a14..5d297533548258 100644 --- a/paddle/fluid/operators/data/pipeline.h +++ b/paddle/fluid/operators/data/pipeline.h @@ -18,7 +18,6 @@ #include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" -#include "paddle/fluid/operators/data/data_scope.h" namespace paddle { namespace operators { @@ -59,6 +58,8 @@ class Pipeline { void ReadNext(std::vector &out_vars); + void ShutDown(); + private: void copy_tensor(const framework::LoDTensor &lod_tensor, framework::LoDTensor *out) const { @@ -127,11 +128,19 @@ class PipelineManager { } } + void ShutDown() { + auto iter = prog_id_to_pipeline_.begin(); + for (; iter != prog_id_to_pipeline_.end(); iter++) { + iter->second.get()->ShutDown(); + } + prog_id_to_pipeline_.clear(); + } + PipelineManager() { VLOG(1) << "PipelineManager init"; } ~PipelineManager() { VLOG(1) << "~PipelineManager"; - prog_id_to_pipeline_.clear(); + ShutDown(); } }; diff --git a/paddle/fluid/operators/data/random_roi_generator.cc b/paddle/fluid/operators/data/random_roi_generator.cc new file mode 100644 index 00000000000000..9adc457f5745ef --- /dev/null +++ b/paddle/fluid/operators/data/random_roi_generator.cc @@ -0,0 +1,104 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/data/random_roi_generator.h" + +namespace paddle { +namespace operators { +namespace data { + +RandomROIGenerator::RandomROIGenerator( + AspectRatioRange aspect_ratio_range, AreaRange area_range, + int64_t seed, int64_t num_attempts) + : aspect_ratio_range_(aspect_ratio_range), + area_range_(area_range), + random_generator_(seed), + seed_(seed), + num_attempts_(num_attempts) {} + +void RandomROIGenerator::GenerateRandomROI( + const int64_t width, const int64_t height, ROI* roi) { + if (width <= 0 || height <= 0) return; + + float min_wh_ratio = aspect_ratio_range_.first; + float max_wh_ratio = aspect_ratio_range_.second; + float max_hw_ratio = 1 / aspect_ratio_range_.first; + float min_area = width * height * area_distribution_.a(); + auto max_width = std::max(1, height * max_wh_ratio); + auto max_height = std::max(1, width * max_hw_ratio); + + // process max_width/height cannot satisfy min_area restriction firstly + if (height * max_width < min_area) { + roi->w = max_width; + roi->h = height; + } else if (width * max_height < min_area) { + roi->w = width; + roi->h = max_height; + } else { + int64_t attempts = num_attempts_; + while (attempts-- > 0) { + // calc ROI area + float scale = area_distribution_(random_generator_); + float roi_area = scale * height * width; + + // calc ROI width/height + float ratio = std::exp( + aspect_ratio_distribution_(random_generator_)); + auto w = static_cast( + std::roundf(sqrtf(roi_area * ratio))); + auto h = static_cast( + std::roundf(sqrtf(roi_area / ratio))); + w = std::max(1, w); + h = 
std::max(1, h); + + // check restrictions + ratio = static_cast(w) / h; + if (w <= width && h <= height + && ratio >= min_wh_ratio && ratio <= max_hw_ratio) { + roi->w = w; + roi->h = h; + break; + } + } + + if (attempts <= 0) { + float max_area = area_distribution_.b() * width * height; + float ratio = static_cast(width) / height; + int64_t w, h; + if (ratio > max_wh_ratio) { + w = max_width; + h = height; + } else if (ratio < min_wh_ratio) { + w = width; + h = max_height; + } else { + w = width; + h = height; + } + float scale = std::min(1.f, max_area / (w * h)); + roi->w = std::max(1, w * sqrtf(scale)); + roi->h = std::max(1, h * sqrtf(scale)); + } + + // generate random left top coordination x, y + roi->x = std::uniform_int_distribution( + 0, width - roi->w)(random_generator_); + roi->y = std::uniform_int_distribution( + 0, height - roi->h)(random_generator_); + } +} + +} // namespace data +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/data/random_roi_generator.h b/paddle/fluid/operators/data/random_roi_generator.h new file mode 100644 index 00000000000000..32b540a57fc756 --- /dev/null +++ b/paddle/fluid/operators/data/random_roi_generator.h @@ -0,0 +1,60 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include + +namespace paddle { +namespace operators { +namespace data { + +using AspectRatioRange = std::pair; +using AreaRange = std::pair; + +struct ROI { + // left top coordination (x, y) + int64_t x; + int64_t y; + // width/height of crop window (w, h) + int64_t w; + int64_t h; +}; + +class RandomROIGenerator { + public: + explicit RandomROIGenerator( + AspectRatioRange aspect_ratio_range, AreaRange area_range, + int64_t seed = time(0), int64_t num_attempts = 10); + + void GenerateRandomROI(const int64_t width, const int64_t height, ROI* roi); + + private: + + AspectRatioRange aspect_ratio_range_; + AreaRange area_range_; + + std::uniform_real_distribution aspect_ratio_distribution_; + std::uniform_real_distribution area_distribution_; + std::mt19937 random_generator_; + + int64_t seed_; + int64_t num_attempts_; +}; + +} // namespace data +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/data/shutdown.h b/paddle/fluid/operators/data/shutdown.h new file mode 100644 index 00000000000000..22fa6b46add5bb --- /dev/null +++ b/paddle/fluid/operators/data/shutdown.h @@ -0,0 +1,47 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include "paddle/fluid/operators/file_label_reader_op.h" +#include "paddle/fluid/operators/data/nvjpeg_decoder.h" +#include "paddle/fluid/operators/data/map_runner.h" +#include "paddle/fluid/operators/data/pipeline.h" + + +namespace paddle { +namespace operators { + +extern FileDataReaderWrapper reader_wrapper; + +namespace data { +extern NvjpegDecoderThreadPool* decode_pool; + +void ShutDownDataLoader() { + LOG(ERROR) << "ShutDownDataLoader enter"; + // step 1: shutdown reader + reader_wrapper.ShutDown(); + + // step 2: shutdown decoder + decode_pool->ShutDown(); + + // step 3: shutdown MapRunner + MapRunnerManager::Instance()->ShutDown(); + + // step 4: shutdown pipeline + PipelineManager::Instance()->ShutDown(); +} +} // namespace data + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/data/unity_build_rule.cmake b/paddle/fluid/operators/data/unity_build_rule.cmake index bc15a29e43743c..c49164464bb030 100644 --- a/paddle/fluid/operators/data/unity_build_rule.cmake +++ b/paddle/fluid/operators/data/unity_build_rule.cmake @@ -7,12 +7,13 @@ register_unity_group(cc pipeline.cc map_runner.cc + random_roi_generator.cc nvjpeg_decoder.cc dataloader_op.cc map_op.cc - batch_decode_op.cc) + batch_decode_random_crop_op.cc) register_unity_group(cu dataloader_op.cu.cc map_op.cu.cc - batch_decode_op.cu) + batch_decode_random_crop_op.cu) diff --git a/paddle/fluid/operators/file_label_reader_op.cc b/paddle/fluid/operators/file_label_reader_op.cc index 428e63572d72c3..55fe8c8f8ea1a2 100644 --- a/paddle/fluid/operators/file_label_reader_op.cc +++ b/paddle/fluid/operators/file_label_reader_op.cc @@ -12,191 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include -#include -#include - -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/framework/lod_tensor_array.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" +#include "paddle/fluid/operators/file_label_reader_op.h" namespace paddle { namespace operators { -using LoDTensorArray = framework::LoDTensorArray; -using LoDTensorBlockingQueue = operators::reader::LoDTensorBlockingQueue; -using LoDTensorBlockingQueueHolder = operators::reader::LoDTensorBlockingQueueHolder; - -enum BufferStatus { - kBufferStatusSuccess = 0, - kBufferStatusErrorClosed, - kBufferStatusEmpty -}; - -template -class Buffer final { - public: - explicit Buffer(size_t max_len = 2) : max_len_(max_len), is_closed_(false) {} - ~Buffer() = default; - - BufferStatus Push(const T& item); - BufferStatus Pull(T* item); - BufferStatus TryReceive(T* item); - void Close(); - - private: - std::queue queue_; - mutable std::mutex mutex_; - size_t max_len_; - bool is_closed_; - std::condition_variable cond_; -}; - -template -BufferStatus Buffer::Push(const T& item) { - std::unique_lock lock(mutex_); - cond_.wait(lock, [this]() { return queue_.size() < max_len_ || is_closed_; }); - if (is_closed_) { - return kBufferStatusErrorClosed; - } - - queue_.push(item); - cond_.notify_one(); - return kBufferStatusSuccess; -} - -template -BufferStatus Buffer::Pull(T* item) { - std::unique_lock lock(mutex_); - cond_.wait(lock, [this]() { return (!queue_.empty()) || is_closed_; }); - if (queue_.empty()) { - return kBufferStatusErrorClosed; - } - *item = queue_.front(); - queue_.pop(); - if (queue_.size() < max_len_) { - cond_.notify_all(); - } - return kBufferStatusSuccess; -} - -template -void Buffer::Close() { - std::unique_lock lock(mutex_); - is_closed_ = true; - cond_.notify_all(); -} - -class FileDataReader { - public: - explicit 
FileDataReader(const framework::ExecutionContext& ctx, - LoDTensorBlockingQueue* queue) { - std::vector files = - ctx.Attr>("files"); - std::vector labels = ctx.Attr>("labels"); - rank_ = ctx.Attr("rank"); - world_size_ = ctx.Attr("world_size"); - // std::cout << "files and labels size: " << files.size() << " " - // << labels.size() << std::endl; - batch_size_ = ctx.Attr("batch_size"); - current_epoch_ = 0; - current_iter_ = 0; - iters_per_epoch_ = labels.size() / (batch_size_ * world_size_); - is_closed_ = false; - for (int i = 0, n = files.size(); i < n; i++) - image_label_pairs_.emplace_back(std::move(files[i]), labels[i]); - StartLoadThread(queue); - } - - int GetStartIndex() { - int start_idx = - batch_size_ * world_size_ * (current_iter_ % iters_per_epoch_) + - rank_ * batch_size_; - current_iter_++; - return start_idx; - } - - framework::LoDTensor ReadSample(const std::string filename) { - std::ifstream input(filename.c_str(), - std::ios::in | std::ios::binary | std::ios::ate); - std::streamsize file_size = input.tellg(); - - input.seekg(0, std::ios::beg); - - // auto* out = ctx.Output("Out"); - framework::LoDTensor out; - std::vector out_shape = {file_size}; - out.Resize(framework::make_ddim(out_shape)); - - uint8_t* data = out.mutable_data(platform::CPUPlace()); - - input.read(reinterpret_cast(data), file_size); - return out; - } - - void StartLoadThread(LoDTensorBlockingQueue* queue) { - if (load_thrd_.joinable()) { - return; - } - - load_thrd_ = std::thread([this, queue] { - while (!is_closed_.load()) LoadBatch(queue); - }); - } - - LoDTensorArray Read() { - LoDTensorArray ret; - ret.reserve(batch_size_); - int start_index = GetStartIndex(); - for (int32_t i = start_index; i < start_index + batch_size_; ++i) { - // FIXME - i %= image_label_pairs_.size(); - framework::LoDTensor tmp = ReadSample(image_label_pairs_[i].first); - ret.push_back(std::move(tmp)); - } - return ret; - } - - // LoDTensorArray Next() { - // LoDTensorArray batch_data; - // 
batch_buffer_.Pull(&batch_data); - // return batch_data; - // } - // - void LoadBatch(LoDTensorBlockingQueue* queue) { - // std::cout << "start LoadBatch 0.01" << std::endl; - LoDTensorArray batch_data = std::move(Read()); - queue->Push(batch_data); - // return batch_buffer_.Push(batch_data) == BufferStatus::kBufferStatusSuccess; - } - - private: - int batch_size_; - std::string file_root_, file_list_; - std::vector> image_label_pairs_; - int current_epoch_; - int current_iter_; - int rank_; - int world_size_; - int iters_per_epoch_; - std::atomic is_closed_; - Buffer batch_buffer_; - std::thread load_thrd_; -}; - -class FileDataReaderWrapper { - public: - void SetUp(const framework::ExecutionContext& ctx, - LoDTensorBlockingQueue* queue) { - reader.reset(new FileDataReader(ctx, queue)); - } - - std::shared_ptr reader = nullptr; -}; - FileDataReaderWrapper reader_wrapper; template diff --git a/paddle/fluid/operators/file_label_reader_op.h b/paddle/fluid/operators/file_label_reader_op.h new file mode 100644 index 00000000000000..2dac1d00cca023 --- /dev/null +++ b/paddle/fluid/operators/file_label_reader_op.h @@ -0,0 +1,219 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include +#include + +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" + +namespace paddle { +namespace operators { + +using LoDTensorArray = framework::LoDTensorArray; +using LoDTensorBlockingQueue = operators::reader::LoDTensorBlockingQueue; +using LoDTensorBlockingQueueHolder = operators::reader::LoDTensorBlockingQueueHolder; + +enum BufferStatus { + kBufferStatusSuccess = 0, + kBufferStatusErrorClosed, + kBufferStatusEmpty +}; + +template +class Buffer final { + public: + explicit Buffer(size_t max_len = 2) : max_len_(max_len), is_closed_(false) {} + ~Buffer() = default; + + BufferStatus Push(const T& item); + BufferStatus Pull(T* item); + BufferStatus TryReceive(T* item); + void Close(); + + private: + std::queue queue_; + mutable std::mutex mutex_; + size_t max_len_; + bool is_closed_; + std::condition_variable cond_; +}; + +template +BufferStatus Buffer::Push(const T& item) { + std::unique_lock lock(mutex_); + cond_.wait(lock, [this]() { return queue_.size() < max_len_ || is_closed_; }); + if (is_closed_) { + return kBufferStatusErrorClosed; + } + + queue_.push(item); + cond_.notify_one(); + return kBufferStatusSuccess; +} + +template +BufferStatus Buffer::Pull(T* item) { + std::unique_lock lock(mutex_); + cond_.wait(lock, [this]() { return (!queue_.empty()) || is_closed_; }); + if (queue_.empty()) { + return kBufferStatusErrorClosed; + } + *item = queue_.front(); + queue_.pop(); + if (queue_.size() < max_len_) { + cond_.notify_all(); + } + return kBufferStatusSuccess; +} + +template +void Buffer::Close() { + std::unique_lock lock(mutex_); + is_closed_ = true; + cond_.notify_all(); +} + +class FileDataReader { + public: + explicit FileDataReader(const 
framework::ExecutionContext& ctx, + LoDTensorBlockingQueue* queue) + : queue_(queue) { + std::vector files = + ctx.Attr>("files"); + std::vector labels = ctx.Attr>("labels"); + rank_ = ctx.Attr("rank"); + world_size_ = ctx.Attr("world_size"); + // std::cout << "files and labels size: " << files.size() << " " + // << labels.size() << std::endl; + batch_size_ = ctx.Attr("batch_size"); + current_epoch_ = 0; + current_iter_ = 0; + iters_per_epoch_ = labels.size() / (batch_size_ * world_size_); + is_closed_ = false; + for (int i = 0, n = files.size(); i < n; i++) + image_label_pairs_.emplace_back(std::move(files[i]), labels[i]); + StartLoadThread(); + } + + int GetStartIndex() { + int start_idx = + batch_size_ * world_size_ * (current_iter_ % iters_per_epoch_) + + rank_ * batch_size_; + current_iter_++; + return start_idx; + } + + framework::LoDTensor ReadSample(const std::string filename) { + std::ifstream input(filename.c_str(), + std::ios::in | std::ios::binary | std::ios::ate); + std::streamsize file_size = input.tellg(); + + input.seekg(0, std::ios::beg); + + // auto* out = ctx.Output("Out"); + framework::LoDTensor out; + std::vector out_shape = {file_size}; + out.Resize(framework::make_ddim(out_shape)); + + uint8_t* data = out.mutable_data(platform::CPUPlace()); + + input.read(reinterpret_cast(data), file_size); + return out; + } + + void StartLoadThread() { + if (load_thrd_.joinable()) { + return; + } + + load_thrd_ = std::thread([this] { + while (!is_closed_.load()) LoadBatch(); + }); + } + + void ShutDown() { + LOG(ERROR) << "FileDataReader shutdown enter"; + if (queue_) queue_->Close(); + + if (load_thrd_.joinable()) { + load_thrd_.join(); + } + } + + LoDTensorArray Read() { + LoDTensorArray ret; + ret.reserve(batch_size_); + int start_index = GetStartIndex(); + for (int32_t i = start_index; i < start_index + batch_size_; ++i) { + // FIXME + i %= image_label_pairs_.size(); + framework::LoDTensor tmp = ReadSample(image_label_pairs_[i].first); + 
ret.push_back(std::move(tmp)); + } + return ret; + } + + // LoDTensorArray Next() { + // LoDTensorArray batch_data; + // batch_buffer_.Pull(&batch_data); + // return batch_data; + // } + // + void LoadBatch() { + // std::cout << "start LoadBatch 0.01" << std::endl; + LoDTensorArray batch_data = std::move(Read()); + queue_->Push(batch_data); + // return batch_buffer_.Push(batch_data) == BufferStatus::kBufferStatusSuccess; + } + + private: + int batch_size_; + std::string file_root_, file_list_; + std::vector> image_label_pairs_; + int current_epoch_; + int current_iter_; + int rank_; + int world_size_; + int iters_per_epoch_; + std::atomic is_closed_; + Buffer batch_buffer_; + std::thread load_thrd_; + LoDTensorBlockingQueue* queue_; +}; + +class FileDataReaderWrapper { + public: + void SetUp(const framework::ExecutionContext& ctx, + LoDTensorBlockingQueue* queue) { + reader.reset(new FileDataReader(ctx, queue)); + } + + std::shared_ptr reader = nullptr; + + void ShutDown() { + LOG(ERROR) << "FileDataReaderWrapper shutdown enter"; + reader->ShutDown(); + } +}; + + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/nvjpeg.h b/paddle/fluid/platform/dynload/nvjpeg.h index 4c5d88b18ddfb6..f9e42c83586c54 100644 --- a/paddle/fluid/platform/dynload/nvjpeg.h +++ b/paddle/fluid/platform/dynload/nvjpeg.h @@ -48,6 +48,7 @@ extern void *nvjpeg_dso_handle; __macro(nvjpegBufferDeviceCreate); \ __macro(nvjpegBufferPinnedCreate); \ __macro(nvjpegDecodeParamsSetOutputFormat); \ + __macro(nvjpegDecodeParamsSetROI); \ __macro(nvjpegStateAttachPinnedBuffer); \ __macro(nvjpegStateAttachDeviceBuffer); \ __macro(nvjpegJpegStreamParse); \ diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 6c4ba10e90e216..498b7d607174af 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -65,6 +65,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/common_infer_shape_functions.h" #include "paddle/fluid/operators/py_func_op.h" +#include "paddle/fluid/operators/data/shutdown.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/device_context.h" @@ -681,6 +682,9 @@ PYBIND11_MODULE(core_noavx, m) { m.def("_promote_types_if_complex_exists", &paddle::framework::PromoteTypesIfComplexExists); + m.def("_shutdown_dataloader", + &paddle::operators::data::ShutDownDataLoader); + BindImperative(&m); py::class_(m, "Tensor", py::buffer_protocol()) diff --git a/python/paddle/fluid/core.py b/python/paddle/fluid/core.py index 1e153cf0f9747a..5e98e496e200e8 100644 --- a/python/paddle/fluid/core.py +++ b/python/paddle/fluid/core.py @@ -277,6 +277,7 @@ def to_list(s): from .core_avx import _device_synchronize from .core_avx import _get_current_stream from .core_avx import _set_current_stream + from .core_avx import _shutdown_dataloader if sys.platform != 'win32': from .core_avx import _set_process_pids from .core_avx import _erase_process_pids @@ -330,6 +331,7 @@ def to_list(s): from .core_noavx import _device_synchronize from .core_noavx import _get_current_stream from .core_noavx import _set_current_stream + from .core_noavx import _shutdown_dataloader if sys.platform != 'win32': from .core_noavx import _set_process_pids from .core_noavx import _erase_process_pids diff --git a/python/paddle/fluid/dataloader/pipeline.py b/python/paddle/fluid/dataloader/pipeline.py index 2e440f0e82807d..47ce0f4d49cc9f 100755 --- a/python/paddle/fluid/dataloader/pipeline.py +++ b/python/paddle/fluid/dataloader/pipeline.py @@ -20,12 +20,16 @@ from paddle import _C_ops from paddle.fluid import core, framework from paddle.fluid.layers.utils import _hash_with_id +from ..multiprocess_utils import CleanupFuncRegistrar from collections.abc import Sequence, Mapping __all__ = ["Pipeline"] 
+CleanupFuncRegistrar.register(core._shutdown_dataloader) + + class Pipeline: """ Data pipeline @@ -108,13 +112,16 @@ def __next__(self): # try: import sys import time + # try: tic = time.time() _C_ops.dataloader(self._output_vars, *self._attrs) toc = time.time() print("_C_ops calling cost {}ms".format((toc - tic) * 1000.)) sys.stdout.flush() - # except KeyboardInterrupt: - # pass + # except: + # print("_C_ops dataloader except enter") + # sys.stdout.flush() + # core._shutdown_dataloader() return {k: v for k, v in zip(self._out_names, self._output_vars)} diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index 3f6332e9f35286..0474144b9c2105 100644 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -31,6 +31,7 @@ 'DeformConv2D', 'read_file', 'decode_jpeg', + 'image_decode_random_crop', 'random_flip', 'roi_pool', 'RoIPool', @@ -920,7 +921,15 @@ def file_label_reader(file_root, batch_size, name=None): return out -def image_decode(x, mode='unchanged', num_threads=2, name=None): +def image_decode_random_crop(x, + mode='unchanged', + num_threads=2, + aspect_ratio_min=3./4., + aspect_ratio_max=4./3., + area_min=0.08, + area_max=1., + num_attempts=10, + name=None): """ Decodes a JPEG image into a 3 dimensional RGB Tensor or 1 dimensional Gray Tensor. Optionally converts the image to the desired format. @@ -931,6 +940,12 @@ def image_decode(x, mode='unchanged', num_threads=2, name=None): of the JPEG image. mode (str): The read mode used for optionally converting the image. Default: 'unchanged'. + num_threads (int): parallel thread number. + aspect_ratio_min (float): + aspect_ratio_max (float): + area_min (float): + area_max (float): + num_attempts (int): name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. 
@@ -954,19 +969,30 @@ def image_decode(x, mode='unchanged', num_threads=2, name=None): """ if in_dygraph_mode(): - return _C_ops.decode(x, "mode", mode) + return _C_ops.batch_decode_random_crop( + x, "mode", mode, "num_threads", num_threads, + "aspect_ratio_min", aspect_ratio_min, + "aspect_ratio_max", aspect_ratio_max, + "area_min", area_min, "area_max", area_max, + "num_attempts", num_attempts) inputs = {'X': x} - attrs = {"mode": mode, "num_threads": num_threads} - - helper = LayerHelper("batch_decode", **locals()) + attrs = {"mode": mode, + "num_threads": num_threads, + "aspect_ratio_min": aspect_ratio_min, + "aspect_ratio_max", aspect_ratio_max, + "area_min", area_min + "area_max", area_max, + "num_attempts", num_attempts} + + helper = LayerHelper("batch_decode_random_crop", **locals()) out = helper.create_variable( - name=unique_name.generate("image_decode"), + name=unique_name.generate("image_decode_random_crop"), type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, dtype=x.dtype) # out = helper.create_variable_for_type_inference('uint8') helper.append_op( - type="batch_decode", inputs=inputs, attrs=attrs, outputs={"Out": out}) + type="batch_decode_random_crop", inputs=inputs, attrs=attrs, outputs={"Out": out}) return out From cec2758f05ea131582616b918f94af93227f36a8 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Mon, 3 Jan 2022 08:14:14 +0000 Subject: [PATCH 33/95] fix typo --- python/paddle/vision/ops.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index 0474144b9c2105..fccb84b10713fd 100644 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -980,10 +980,10 @@ def image_decode_random_crop(x, attrs = {"mode": mode, "num_threads": num_threads, "aspect_ratio_min": aspect_ratio_min, - "aspect_ratio_max", aspect_ratio_max, - "area_min", area_min - "area_max", area_max, - "num_attempts", num_attempts} + "aspect_ratio_max": aspect_ratio_max, + "area_min": area_min, 
+ "area_max": area_max, + "num_attempts": num_attempts} helper = LayerHelper("batch_decode_random_crop", **locals()) out = helper.create_variable( From 3eddb611b19a123edeaf8b7ae07d85c118e7e99d Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 4 Jan 2022 13:49:45 +0000 Subject: [PATCH 34/95] fix exit segmentfault, need update --- paddle/fluid/operators/data/map_runner.cc | 10 ++++---- paddle/fluid/operators/data/nvjpeg_decoder.cc | 23 ++++++++++++------- paddle/fluid/operators/data/nvjpeg_decoder.h | 2 +- paddle/fluid/operators/data/shutdown.h | 17 +++++++++----- paddle/fluid/operators/file_label_reader_op.h | 3 ++- python/paddle/fluid/dataloader/pipeline.py | 4 +++- 6 files changed, 37 insertions(+), 22 deletions(-) diff --git a/paddle/fluid/operators/data/map_runner.cc b/paddle/fluid/operators/data/map_runner.cc index 5c687eecfec79b..3d9260581f89f7 100644 --- a/paddle/fluid/operators/data/map_runner.cc +++ b/paddle/fluid/operators/data/map_runner.cc @@ -119,11 +119,11 @@ void MapRunner::StartMapThread(const Scope* scope) { ShutDown(); return false; } - LOG(ERROR) << "MapThread Loop " << program_id_ << " ShareInputsIntoScope finish"; + // LOG(ERROR) << "MapThread Loop " << program_id_ << " ShareInputsIntoScope finish"; // Step 2: run ops by executor without fetch - executor.Run(*map_block_->Program(), &scope_, map_block_->ID(), false, true, std::vector(), false, true); - LOG(ERROR) << "MapThread Loop " << program_id_ << " program run finish"; + executor.Run(*map_block_->Program(), &scope_, static_cast(map_block_->ID()), false, true, std::vector(), false, true); + // LOG(ERROR) << "MapThread Loop " << program_id_ << " program run finish"; // Step 3: fetch output variable to LoDTensor vector // and push to output queue @@ -149,10 +149,10 @@ void MapRunner::StartMapThread(const Scope* scope) { output_queues_[i]->Push(t_arr); } } - LOG(ERROR) << "MapThread Loop " << program_id_ << " push queue finish"; + // LOG(ERROR) << "MapThread Loop " << program_id_ << " push 
queue finish"; } scope->DeleteScope(&scope_); - LOG(ERROR) << "MapThread Loop " << program_id_ << " delete scope and return"; + // LOG(ERROR) << "MapThread Loop " << program_id_ << " delete scope and return"; return true; })); } diff --git a/paddle/fluid/operators/data/nvjpeg_decoder.cc b/paddle/fluid/operators/data/nvjpeg_decoder.cc index 4becec40416ddb..09cd8fcf32ebd1 100644 --- a/paddle/fluid/operators/data/nvjpeg_decoder.cc +++ b/paddle/fluid/operators/data/nvjpeg_decoder.cc @@ -201,17 +201,22 @@ void NvjpegDecoderThreadPool::WaitTillTasksCompleted() { } void NvjpegDecoderThreadPool::ShutDown() { - std::lock_guard lock(mutex_); + // LOG(ERROR) << "NvjpegDecoderThreadPool ShutDown enter"; + std::unique_lock lock(mutex_); running_ = false; - shutdown_.store(true); + shutdown_ = true; running_cond_.notify_all(); + lock.unlock(); - task_queue_.clear(); - - for (auto &thread : threads_) { + for (auto& thread : threads_) { + // LOG(ERROR) << "NvjpegDecoderThreadPool ShutDown thread join, shutdown_ " << shutdown_; thread.join(); + // LOG(ERROR) << "NvjpegDecoderThreadPool ShutDown thread join finish 1"; } + + task_queue_.clear(); + // LOG(ERROR) << "NvjpegDecoderThreadPool ShutDown finish"; } void NvjpegDecoderThreadPool::SortTaskByLengthDescend() { @@ -226,10 +231,12 @@ void NvjpegDecoderThreadPool::SortTaskByLengthDescend() { void NvjpegDecoderThreadPool::ThreadLoop(const int thread_idx) { NvjpegDecoder* decoder = new NvjpegDecoder(mode_); - while (!shutdown_.load()) { + while (!shutdown_) { std::unique_lock lock(mutex_); - running_cond_.wait(lock, [this] { return running_ && !task_queue_.empty(); }); - if (shutdown_.load()) break; + // LOG(ERROR) << "ThreadLoop wait running_cond_"; + running_cond_.wait(lock, [this] { return (running_ && !task_queue_.empty()) || shutdown_; }); + // LOG(ERROR) << "ThreadLoop shutdown_ " << shutdown_; + if (shutdown_) break; auto task = task_queue_.front(); task_queue_.pop_front(); diff --git 
a/paddle/fluid/operators/data/nvjpeg_decoder.h b/paddle/fluid/operators/data/nvjpeg_decoder.h index ebcf1762ade55f..92911e1b6ce715 100644 --- a/paddle/fluid/operators/data/nvjpeg_decoder.h +++ b/paddle/fluid/operators/data/nvjpeg_decoder.h @@ -106,7 +106,7 @@ class NvjpegDecoderThreadPool { std::deque> task_queue_; std::mutex mutex_; - std::atomic shutdown_; + bool shutdown_; std::condition_variable running_cond_; bool running_; std::condition_variable completed_cond_; diff --git a/paddle/fluid/operators/data/shutdown.h b/paddle/fluid/operators/data/shutdown.h index 22fa6b46add5bb..90a0eab103ede0 100644 --- a/paddle/fluid/operators/data/shutdown.h +++ b/paddle/fluid/operators/data/shutdown.h @@ -25,21 +25,26 @@ namespace operators { extern FileDataReaderWrapper reader_wrapper; namespace data { + extern NvjpegDecoderThreadPool* decode_pool; void ShutDownDataLoader() { LOG(ERROR) << "ShutDownDataLoader enter"; // step 1: shutdown reader reader_wrapper.ShutDown(); + LOG(ERROR) << "ShutDownDataLoader reader_wrapper shutdown finish"; - // step 2: shutdown decoder - decode_pool->ShutDown(); + // // step 2: shutdown decoder + // decode_pool->ShutDown(); + // LOG(ERROR) << "ShutDownDataLoader decode_pool shutdown finish"; - // step 3: shutdown MapRunner - MapRunnerManager::Instance()->ShutDown(); + // // step 3: shutdown MapRunner + // MapRunnerManager::Instance()->ShutDown(); + // LOG(ERROR) << "ShutDownDataLoader MapRunner shutdown finish"; - // step 4: shutdown pipeline - PipelineManager::Instance()->ShutDown(); + // // step 4: shutdown pipeline + // PipelineManager::Instance()->ShutDown(); + // LOG(ERROR) << "ShutDownDataLoader PipelineManager shutdown finish"; } } // namespace data diff --git a/paddle/fluid/operators/file_label_reader_op.h b/paddle/fluid/operators/file_label_reader_op.h index 2dac1d00cca023..2354db019ee3be 100644 --- a/paddle/fluid/operators/file_label_reader_op.h +++ b/paddle/fluid/operators/file_label_reader_op.h @@ -150,9 +150,9 @@ class 
FileDataReader { } void ShutDown() { - LOG(ERROR) << "FileDataReader shutdown enter"; if (queue_) queue_->Close(); + is_closed_.store(true); if (load_thrd_.joinable()) { load_thrd_.join(); } @@ -211,6 +211,7 @@ class FileDataReaderWrapper { void ShutDown() { LOG(ERROR) << "FileDataReaderWrapper shutdown enter"; reader->ShutDown(); + LOG(ERROR) << "FileDataReaderWrapper shutdown finish"; } }; diff --git a/python/paddle/fluid/dataloader/pipeline.py b/python/paddle/fluid/dataloader/pipeline.py index 47ce0f4d49cc9f..248d31a2ad38e5 100755 --- a/python/paddle/fluid/dataloader/pipeline.py +++ b/python/paddle/fluid/dataloader/pipeline.py @@ -112,7 +112,6 @@ def __next__(self): # try: import sys import time - # try: tic = time.time() _C_ops.dataloader(self._output_vars, *self._attrs) toc = time.time() @@ -128,3 +127,6 @@ def __next__(self): # Python 2 compatable def next(self): return self.__next__() + + def __del__(self): + core._shutdown_dataloader() From 887e749b8b88928c8cb04abf3b1ed223d23a2cb4 Mon Sep 17 00:00:00 2001 From: LielinJiang Date: Wed, 5 Jan 2022 09:08:18 +0000 Subject: [PATCH 35/95] add label and multi-gpu --- .../data/batch_decode_random_crop_op.cc | 3 ++ .../data/batch_decode_random_crop_op.cu | 9 ++-- paddle/fluid/operators/data/nvjpeg_decoder.cc | 8 ++-- paddle/fluid/operators/data/nvjpeg_decoder.h | 6 ++- .../fluid/operators/file_label_reader_op.cc | 14 +++++- paddle/fluid/operators/file_label_reader_op.h | 47 +++++++++++++++---- python/paddle/fluid/dataloader/pipeline.py | 4 +- python/paddle/vision/ops.py | 19 ++++++-- 8 files changed, 86 insertions(+), 24 deletions(-) diff --git a/paddle/fluid/operators/data/batch_decode_random_crop_op.cc b/paddle/fluid/operators/data/batch_decode_random_crop_op.cc index 27677e5c4bad08..543b97ed91ad15 100644 --- a/paddle/fluid/operators/data/batch_decode_random_crop_op.cc +++ b/paddle/fluid/operators/data/batch_decode_random_crop_op.cc @@ -121,6 +121,9 @@ or 1 dimensional Gray Tensor. 
Optionally converts the image to the desired format. The values of the output tensor are uint8 between 0 and 255. )DOC"); + AddAttr("local_rank", + "(int64_t)" + "The index of the op to start execution"); AddAttr("num_threads", "Path of the file to be readed.") .SetDefault(2); AddAttr( diff --git a/paddle/fluid/operators/data/batch_decode_random_crop_op.cu b/paddle/fluid/operators/data/batch_decode_random_crop_op.cu index e8fe313d6c4883..d8e28c0258ca43 100644 --- a/paddle/fluid/operators/data/batch_decode_random_crop_op.cu +++ b/paddle/fluid/operators/data/batch_decode_random_crop_op.cu @@ -33,11 +33,11 @@ class GPUBatchDecodeRandomCropKernel : public framework::OpKernel { int num_threads = ctx.Attr("num_threads"); LOG(ERROR) << "GPUBatchDecodeJpegKernel Compute start, num_threads: " << num_threads; auto mode = ctx.Attr("mode"); - + auto local_rank = ctx.Attr("local_rank"); // multi-phrase decode thread pool if (!decode_pool) { LOG(ERROR) << "GPUBatchDecodeJpegKernel decode_pool init"; - decode_pool = new NvjpegDecoderThreadPool(num_threads, mode); + decode_pool = new NvjpegDecoderThreadPool(num_threads, mode, local_rank); // rand_seq = new std::seed_seq(static_cast(time(0))); } @@ -45,6 +45,8 @@ class GPUBatchDecodeRandomCropKernel : public framework::OpKernel { ctx.Input("X"); auto* out = ctx.OutputVar("Out"); + auto dev = platform::CUDAPlace(local_rank); + auto& out_array = *out->GetMutable(); out_array.resize(inputs->size()); @@ -90,7 +92,8 @@ class GPUBatchDecodeRandomCropKernel : public framework::OpKernel { .tensor = &out_array[i], .roi_generator = new RandomROIGenerator( aspect_ratio_range, area_range, rands[i]), - .place = ctx.GetPlace() + .place = dev + // .place = ctx.GetPlace() }; decode_pool->AddTask(std::make_shared(task)); } diff --git a/paddle/fluid/operators/data/nvjpeg_decoder.cc b/paddle/fluid/operators/data/nvjpeg_decoder.cc index 4becec40416ddb..456c5e66755f9b 100644 --- a/paddle/fluid/operators/data/nvjpeg_decoder.cc +++ 
b/paddle/fluid/operators/data/nvjpeg_decoder.cc @@ -18,11 +18,12 @@ namespace paddle { namespace operators { namespace data { -NvjpegDecoder::NvjpegDecoder(std::string mode) +NvjpegDecoder::NvjpegDecoder(std::string mode, int dev_id) : nvjpeg_streams_(2), pinned_buffers_(2), page_id_(0), mode_(mode) { + platform::SetDeviceId(dev_id); // create cuda stream PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreateWithFlags(&cuda_stream_, cudaStreamNonBlocking)); @@ -158,9 +159,10 @@ void NvjpegDecoder::Run( Decode(bit_stream, bit_len, &image); } -NvjpegDecoderThreadPool::NvjpegDecoderThreadPool(const int num_threads, const std::string mode) +NvjpegDecoderThreadPool::NvjpegDecoderThreadPool(const int num_threads, const std::string mode, const int dev_id) : threads_(num_threads), mode_(mode), + dev_id_(dev_id), shutdown_(false), running_(false), completed_(false), @@ -224,7 +226,7 @@ void NvjpegDecoderThreadPool::SortTaskByLengthDescend() { } void NvjpegDecoderThreadPool::ThreadLoop(const int thread_idx) { - NvjpegDecoder* decoder = new NvjpegDecoder(mode_); + NvjpegDecoder* decoder = new NvjpegDecoder(mode_, dev_id_); while (!shutdown_.load()) { std::unique_lock lock(mutex_); diff --git a/paddle/fluid/operators/data/nvjpeg_decoder.h b/paddle/fluid/operators/data/nvjpeg_decoder.h index ebcf1762ade55f..466e94ce17f06d 100644 --- a/paddle/fluid/operators/data/nvjpeg_decoder.h +++ b/paddle/fluid/operators/data/nvjpeg_decoder.h @@ -17,6 +17,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/platform/dynload/nvjpeg.h" #include "paddle/fluid/platform/stream/cuda_stream.h" @@ -43,7 +44,7 @@ struct NvjpegDecodeTask { class NvjpegDecoder { public: - NvjpegDecoder(const std::string mode); + NvjpegDecoder(const std::string mode, int dev_id); ~NvjpegDecoder(); @@ -81,7 +82,7 @@ class NvjpegDecoder { class NvjpegDecoderThreadPool { public: - NvjpegDecoderThreadPool(const int num_threads, const std::string mode); + NvjpegDecoderThreadPool(const int num_threads, const std::string mode, const int dev_id); ~NvjpegDecoderThreadPool(); @@ -102,6 +103,7 @@ class NvjpegDecoderThreadPool { std::vector threads_; std::string mode_; + int dev_id_; std::deque> task_queue_; std::mutex mutex_; diff --git a/paddle/fluid/operators/file_label_reader_op.cc b/paddle/fluid/operators/file_label_reader_op.cc index 55fe8c8f8ea1a2..a539d015390e08 100644 --- a/paddle/fluid/operators/file_label_reader_op.cc +++ b/paddle/fluid/operators/file_label_reader_op.cc @@ -65,9 +65,20 @@ class FileLabelReaderOp : public framework::OperatorBase { out_queue = holder->GetQueue(); } + auto* out_label = scope.FindVar(Output("Label")); + auto out_label_queue = + out_label->Get().GetQueue(); + if (out_label_queue == nullptr) { + LOG(ERROR) << "FileLabelReaderOp init output label queue"; + auto* label_holder = + out_label->template GetMutable(); + label_holder->InitOnce(2); + out_label_queue = label_holder->GetQueue(); + } + if (reader_wrapper.reader == nullptr) { // create reader - reader_wrapper.SetUp(ctx, out_queue.get()); + reader_wrapper.SetUp(ctx, out_queue.get(), out_label_queue.get()); } // LoDTensorArray samples = reader_wrapper.reader->Next(); // framework::LoDTensorArray out_array; @@ -94,6 +105,7 @@ class FileLabelReaderOpMaker : public framework::OpProtoAndCheckerMaker { public: void 
Make() override { AddOutput("Out", "The output tensor of ReadFile op"); + AddOutput("Label", "The output tensor of ReadFile op"); AddComment(R"DOC( This operator read a file. )DOC"); diff --git a/paddle/fluid/operators/file_label_reader_op.h b/paddle/fluid/operators/file_label_reader_op.h index 2dac1d00cca023..7976ad036387c8 100644 --- a/paddle/fluid/operators/file_label_reader_op.h +++ b/paddle/fluid/operators/file_label_reader_op.h @@ -94,8 +94,8 @@ void Buffer::Close() { class FileDataReader { public: explicit FileDataReader(const framework::ExecutionContext& ctx, - LoDTensorBlockingQueue* queue) - : queue_(queue) { + LoDTensorBlockingQueue* queue, LoDTensorBlockingQueue* label_queue) + : queue_(queue), label_queue_(label_queue){ std::vector files = ctx.Attr>("files"); std::vector labels = ctx.Attr>("labels"); @@ -158,8 +158,21 @@ class FileDataReader { } } - LoDTensorArray Read() { + // LoDTensorArray Read() { + // LoDTensorArray ret; + // ret.reserve(batch_size_); + // int start_index = GetStartIndex(); + // for (int32_t i = start_index; i < start_index + batch_size_; ++i) { + // // FIXME + // i %= image_label_pairs_.size(); + // framework::LoDTensor tmp = ReadSample(image_label_pairs_[i].first); + // ret.push_back(std::move(tmp)); + // } + // return ret; + // } + std::pair> Read() { LoDTensorArray ret; + std::vector label; ret.reserve(batch_size_); int start_index = GetStartIndex(); for (int32_t i = start_index; i < start_index + batch_size_; ++i) { @@ -167,8 +180,9 @@ class FileDataReader { i %= image_label_pairs_.size(); framework::LoDTensor tmp = ReadSample(image_label_pairs_[i].first); ret.push_back(std::move(tmp)); + label.push_back(image_label_pairs_[i].second); } - return ret; + return std::make_pair(ret, label); } // LoDTensorArray Next() { @@ -179,9 +193,23 @@ class FileDataReader { // void LoadBatch() { // std::cout << "start LoadBatch 0.01" << std::endl; - LoDTensorArray batch_data = std::move(Read()); - queue_->Push(batch_data); - // return 
batch_buffer_.Push(batch_data) == BufferStatus::kBufferStatusSuccess; + // LoDTensorArray batch_data = std::move(Read()); + // queue_->Push(batch_data); + + auto batch_data = std::move(Read()); + queue_->Push(batch_data.first); + framework::LoDTensor label_tensor; + LoDTensorArray label_array; + // auto& label_tensor = label.GetMutable(); + label_tensor.Resize( + framework::make_ddim({static_cast(batch_data.first.size())})); + platform::CPUPlace cpu; + auto* label_data = label_tensor.mutable_data(cpu); + for (size_t i = 0; i < batch_data.first.size(); ++i) { + label_data[i] = batch_data.second[i]; + } + label_array.push_back(label_tensor); + label_queue_->Push(label_array); } private: @@ -197,13 +225,14 @@ class FileDataReader { Buffer batch_buffer_; std::thread load_thrd_; LoDTensorBlockingQueue* queue_; + LoDTensorBlockingQueue* label_queue_; }; class FileDataReaderWrapper { public: void SetUp(const framework::ExecutionContext& ctx, - LoDTensorBlockingQueue* queue) { - reader.reset(new FileDataReader(ctx, queue)); + LoDTensorBlockingQueue* queue, LoDTensorBlockingQueue* label_queue) { + reader.reset(new FileDataReader(ctx, queue, label_queue)); } std::shared_ptr reader = nullptr; diff --git a/python/paddle/fluid/dataloader/pipeline.py b/python/paddle/fluid/dataloader/pipeline.py index 47ce0f4d49cc9f..28d30f5c763528 100755 --- a/python/paddle/fluid/dataloader/pipeline.py +++ b/python/paddle/fluid/dataloader/pipeline.py @@ -63,7 +63,9 @@ def __exit__(self, exception_type, exception_value, traceback): self._main_program = framework.switch_main_program(self._main_program) self._startup_program = framework.switch_startup_program( self._startup_program) - paddle.disable_static() + + local_rank = paddle.distributed.get_rank() + paddle.disable_static("gpu:" + str(local_rank)) def set_outputs(self, outputs): if isinstance(outputs, Sequence): diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index fccb84b10713fd..9cbc00e1dd9229 100644 --- 
a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -912,13 +912,21 @@ def file_label_reader(file_root, batch_size, name=None): name=unique_name.generate("file_label_reader"), type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, dtype='uint8') + + label = helper.create_variable( + name=unique_name.generate("file_label_reader"), + type=core.VarDesc.VarType.LOD_TENSOR, + dtype='int') + helper.append_op( type="file_label_reader", inputs=inputs, attrs=attrs, - outputs={"Out": out}) + outputs={"Out": out, + "Label": label + }) - return out + return out, label def image_decode_random_crop(x, @@ -967,14 +975,14 @@ def image_decode_random_crop(x, print(img.shape) """ - + local_rank = paddle.distributed.get_rank() if in_dygraph_mode(): return _C_ops.batch_decode_random_crop( x, "mode", mode, "num_threads", num_threads, "aspect_ratio_min", aspect_ratio_min, "aspect_ratio_max", aspect_ratio_max, "area_min", area_min, "area_max", area_max, - "num_attempts", num_attempts) + "num_attempts", num_attempts, "local_rank", local_rank) inputs = {'X': x} attrs = {"mode": mode, @@ -983,7 +991,8 @@ def image_decode_random_crop(x, "aspect_ratio_max": aspect_ratio_max, "area_min": area_min, "area_max": area_max, - "num_attempts": num_attempts} + "num_attempts": num_attempts, + "local_rank": local_rank} helper = LayerHelper("batch_decode_random_crop", **locals()) out = helper.create_variable( From 632e8b0ebfb356fc79567298ef10e63d61efd880 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Wed, 5 Jan 2022 14:27:31 +0000 Subject: [PATCH 36/95] add batch_resize/batch_decode --- paddle/fluid/operators/data/CMakeLists.txt | 4 + .../fluid/operators/data/batch_decode_op.cc | 108 +++++++ .../fluid/operators/data/batch_decode_op.cu | 98 +++++++ paddle/fluid/operators/data/batch_decode_op.h | 44 +++ .../data/batch_decode_random_crop_op.cu | 21 +- .../data/batch_decode_random_crop_op.h | 2 +- .../fluid/operators/data/batch_resize_op.cc | 112 +++++++ .../fluid/operators/data/batch_resize_op.cu | 277 
++++++++++++++++++ paddle/fluid/operators/data/batch_resize_op.h | 38 +++ paddle/fluid/operators/data/nvjpeg_decoder.cc | 11 +- .../{ => data}/random_crop_and_resize_op.cc | 20 +- .../{ => data}/random_crop_and_resize_op.cu | 8 +- .../{ => data}/random_crop_and_resize_op.h | 19 ++ paddle/fluid/operators/file_label_reader_op.h | 4 +- python/paddle/vision/ops.py | 139 ++++++++- 15 files changed, 858 insertions(+), 47 deletions(-) create mode 100644 paddle/fluid/operators/data/batch_decode_op.cc create mode 100644 paddle/fluid/operators/data/batch_decode_op.cu create mode 100644 paddle/fluid/operators/data/batch_decode_op.h create mode 100644 paddle/fluid/operators/data/batch_resize_op.cc create mode 100644 paddle/fluid/operators/data/batch_resize_op.cu create mode 100644 paddle/fluid/operators/data/batch_resize_op.h rename paddle/fluid/operators/{ => data}/random_crop_and_resize_op.cc (90%) rename paddle/fluid/operators/{ => data}/random_crop_and_resize_op.cu (98%) rename paddle/fluid/operators/{ => data}/random_crop_and_resize_op.h (68%) diff --git a/paddle/fluid/operators/data/CMakeLists.txt b/paddle/fluid/operators/data/CMakeLists.txt index 6c0806249d8345..5f24e27d2ef866 100644 --- a/paddle/fluid/operators/data/CMakeLists.txt +++ b/paddle/fluid/operators/data/CMakeLists.txt @@ -13,6 +13,10 @@ op_library(map_op SRCS map_op.cc map_op.cu.cc DEPS map_runner ${OP_HEADER_DEPS}) cc_library(random_roi_generator SRCS random_roi_generator.cc DEPS ${OP_HEADER_DEPS}) cc_library(nvjpeg_decoder SRCS nvjpeg_decoder.cc DEPS random_roi_generator ${OP_HEADER_DEPS}) op_library(batch_decode_random_crop_op SRCS batch_decode_random_crop_op.cc batch_decode_random_crop_op.cu DEPS nvjpeg_decoder ${OP_HEADER_DEPS}) +op_library(batch_decode_op SRCS batch_decode_op.cc batch_decode_op.cu DEPS nvjpeg_decoder ${OP_HEADER_DEPS}) + +op_library(random_crop_and_resize_op SRCS random_crop_and_resize_op.cc random_crop_and_resize_op.cu DEPS ${OP_HEADER_DEPS}) +op_library(batch_resize_op SRCS 
batch_resize_op.cc batch_resize_op.cu DEPS ${OP_HEADER_DEPS}) # register_operators() diff --git a/paddle/fluid/operators/data/batch_decode_op.cc b/paddle/fluid/operators/data/batch_decode_op.cc new file mode 100644 index 00000000000000..6d97e4fa426c00 --- /dev/null +++ b/paddle/fluid/operators/data/batch_decode_op.cc @@ -0,0 +1,108 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/data/batch_decode_op.h" + +namespace paddle { +namespace operators { +namespace data { + +class BatchDecodeOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "DecodeJpeg"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "DecodeJpeg"); + + // auto mode = ctx->Attrs().Get("mode"); + // std::vector out_dims; + // + // if (mode == "unchanged") { + // out_dims = {-1, -1, -1}; + // } else if (mode == "gray") { + // out_dims = {1, -1, -1}; + // } else if (mode == "rgb") { + // out_dims = {3, -1, -1}; + // } else { + // PADDLE_THROW(platform::errors::Fatal( + // "The provided mode is not supported for JPEG files on GPU: ", mode)); + // } + // + // ctx->SetOutputDim("Out", framework::make_ddim(out_dims)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const 
framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::proto::VarType::UINT8, ctx.GetPlace()); + } + + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const framework::Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const { + if (var_name == "X") { + return expected_kernel_type; + } + + return framework::OpKernelType(tensor.type(), tensor.place(), + tensor.layout()); + } +}; + +class BatchDecodeInferVarType : public framework::VarTypeInference { + public: + void operator()(framework::InferVarTypeContext* ctx) const override { + ctx->SetOutputType("Out", framework::proto::VarType::LOD_TENSOR_ARRAY, + framework::ALL_ELEMENTS); + } +}; + +class BatchDecodeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "A one dimensional uint8 tensor containing the raw bytes " + "of the JPEG image. It is a tensor with rank 1."); + AddOutput("Out", "The output tensor of DecodeJpeg op"); + AddComment(R"DOC( +This operator decodes a JPEG image into a 3 dimensional RGB Tensor +or 1 dimensional Gray Tensor. Optionally converts the image to the +desired format. The values of the output tensor are uint8 between 0 +and 255. 
+)DOC"); + AddAttr("num_threads", "Path of the file to be readed.") + .SetDefault(2); + AddAttr( + "mode", + "(string, default \"unchanged\"), The read mode used " + "for optionally converting the image, can be \"unchanged\" " + ",\"gray\" , \"rgb\" .") + .SetDefault("unchanged"); + } +}; + +} // namespace data +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR( + batch_decode, ops::data::BatchDecodeOp, ops::data::BatchDecodeOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker) + +REGISTER_OP_CPU_KERNEL(batch_decode, ops::data::CPUBatchDecodeKernel) diff --git a/paddle/fluid/operators/data/batch_decode_op.cu b/paddle/fluid/operators/data/batch_decode_op.cu new file mode 100644 index 00000000000000..6b906521b51d6a --- /dev/null +++ b/paddle/fluid/operators/data/batch_decode_op.cu @@ -0,0 +1,98 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#if !defined(WITH_NV_JETSON) && !defined(PADDLE_WITH_HIP) + +#include "paddle/fluid/operators/data/batch_decode_random_crop_op.h" +#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" + +namespace paddle { +namespace operators { +namespace data { + +using LoDTensorBlockingQueueHolder = operators::reader::LoDTensorBlockingQueueHolder; + +static NvjpegDecoderThreadPool* decode_pool = nullptr; + +template +class GPUBatchDecodeKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + int num_threads = ctx.Attr("num_threads"); + LOG(ERROR) << "GPUBatchDecodeJpegKernel Compute start, num_threads: " << num_threads; + auto mode = ctx.Attr("mode"); + + // multi-phrase decode thread pool + if (!decode_pool) { + LOG(ERROR) << "GPUBatchDecodeJpegKernel decode_pool init"; + decode_pool = new NvjpegDecoderThreadPool(num_threads, mode); + } + + const framework::LoDTensorArray* inputs = + ctx.Input("X"); + + auto* out = ctx.OutputVar("Out"); + auto& out_array = *out->GetMutable(); + out_array.resize(inputs->size()); + + for (size_t i = 0; i < inputs->size(); i++) { + const framework::LoDTensor x = inputs->at(i); + auto* x_data = x.data(); + size_t x_numel = static_cast(x.numel()); + + NvjpegDecodeTask task = { + .bit_stream = x_data, + .bit_len = x_numel, + .tensor = &out_array[i], + .roi_generator = nullptr, + .place = ctx.GetPlace() + }; + decode_pool->AddTask(std::make_shared(task)); + } + + decode_pool->RunAll(true); + // out_queue->Push(out_array); + + // // multi-phrase decode single thread + // if (!nvjpeg_decoder) { + // nvjpeg_decoder = new NvjpegDecoder(mode); + // } + // + // const framework::LoDTensorArray* inputs = + // ctx.Input("X"); + // + // auto* out = ctx.OutputVar("Out"); + // auto& out_array = *out->GetMutable(); + // out_array.resize(inputs->size()); + // + // for (size_t i = 0; i < inputs->size(); i++) { + // const framework::LoDTensor x = inputs->at(i); + // auto* x_data = 
x.data(); + // + // nvjpeg_decoder->Run(x_data, static_cast(x.numel()), + // &out_array[i], &ctx); + // } + + LOG(ERROR) << "GPUBatchDecodeJpegKernel Compute finish"; + } +}; + +} // namespace data +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(batch_decode, ops::data::GPUBatchDecodeKernel) + +#endif diff --git a/paddle/fluid/operators/data/batch_decode_op.h b/paddle/fluid/operators/data/batch_decode_op.h new file mode 100644 index 00000000000000..a16385b594c293 --- /dev/null +++ b/paddle/fluid/operators/data/batch_decode_op.h @@ -0,0 +1,44 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include + +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/operators/data/nvjpeg_decoder.h" + + +namespace paddle { +namespace operators { +namespace data { + +template +class CPUBatchDecodeKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + // TODO(LieLinJiang): add cpu implement. 
+ PADDLE_THROW(platform::errors::Unimplemented( + "BatchDecode op only supports GPU now.")); + } +}; + +} // namespace data +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/data/batch_decode_random_crop_op.cu b/paddle/fluid/operators/data/batch_decode_random_crop_op.cu index e8fe313d6c4883..4199705fe59267 100644 --- a/paddle/fluid/operators/data/batch_decode_random_crop_op.cu +++ b/paddle/fluid/operators/data/batch_decode_random_crop_op.cu @@ -23,7 +23,7 @@ namespace data { using LoDTensorBlockingQueueHolder = operators::reader::LoDTensorBlockingQueueHolder; -NvjpegDecoderThreadPool* decode_pool = nullptr; +static NvjpegDecoderThreadPool* decode_pool = nullptr; // std::seed_seq* rand_seq = nullptr; template @@ -60,25 +60,6 @@ class GPUBatchDecodeRandomCropKernel : public framework::OpKernel { std::vector rands(inputs->size()); rand_seq.generate(rands.begin(), rands.end()); - // auto* in_var = ctx.InputVar("X"); - // auto in_queue = in_var->Get().GetQueue(); - // - // auto* out_var = ctx.OutputVar("Out"); - // auto out_queue = out_var->Get().GetQueue(); - // if (out_queue == nullptr) { - // LOG(ERROR) << "decode init output queue"; - // auto* holder = out_var->template GetMutable(); - // holder->InitOnce(2); - // out_queue = holder->GetQueue(); - // } - // - // bool success = true; - // auto inputs = in_queue->Pop(&success); - // PADDLE_ENFORCE_EQ(success, true, - // platform::errors::PreconditionNotMet("Read from input queue failed")); - // framework::LoDTensorArray out_array; - // out_array.resize(inputs.size()); - for (size_t i = 0; i < inputs->size(); i++) { const framework::LoDTensor x = inputs->at(i); auto* x_data = x.data(); diff --git a/paddle/fluid/operators/data/batch_decode_random_crop_op.h b/paddle/fluid/operators/data/batch_decode_random_crop_op.h index 592adf1c563da7..fd23be38341dc9 100644 --- a/paddle/fluid/operators/data/batch_decode_random_crop_op.h +++ b/paddle/fluid/operators/data/batch_decode_random_crop_op.h 
@@ -35,7 +35,7 @@ class CPUBatchDecodeRandomCropKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { // TODO(LieLinJiang): add cpu implement. PADDLE_THROW(platform::errors::Unimplemented( - "DecodeJpeg op only supports GPU now.")); + "BatchDecodeRandomCrop op only supports GPU now.")); } }; diff --git a/paddle/fluid/operators/data/batch_resize_op.cc b/paddle/fluid/operators/data/batch_resize_op.cc new file mode 100644 index 00000000000000..afc21cd0c5a1c0 --- /dev/null +++ b/paddle/fluid/operators/data/batch_resize_op.cc @@ -0,0 +1,112 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/operators/data/batch_resize_op.h" + +namespace paddle { +namespace operators { +namespace data { + +class BatchResizeOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "BatchResize"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", + "BatchResize"); + + auto size = ctx->Attrs().Get>("size"); + PADDLE_ENFORCE_EQ(size.size(), 2, + platform::errors::InvalidArgument( + "The length of Attrs(size) should be 2.")); + PADDLE_ENFORCE_GT(size[0], 0, + platform::errors::InvalidArgument( + "h in Attr(size) of Op(BatchResize) " + "should be greater than 0.")); + PADDLE_ENFORCE_GT(size[1], 0, + platform::errors::InvalidArgument( + "w in Attr(size) of Op(BatchResize) " + "should be greater than 0.")); + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::proto::VarType::UINT8, ctx.GetPlace()); + } + + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const framework::Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const override { + if (var_name == "X") { + return expected_kernel_type; + } + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); + } +}; + +class BatchResizeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(LoDTensorArray). A batch of instances to random crop."); + AddOutput("Out", "(Tensor). 
The cropped instance batch."); + AddAttr>( + "size", "expected output size of the crop, for each edge."); + AddAttr("interp_method", + "(string, default \"bilinear\"), interpolation " + "method, can be \"bilinear\" for " + "bilinear interpolation and \"nearest\" for nearest " + "neighbor interpolation.") + .SetDefault("bilinear"); + AddAttr( + "align_corners", + "an optional bool. Defaults to True. " + "If True, the centers of 4 corner pixels of the input and output " + "tensors are aligned, preserving the values at the corner pixels, " + "If False, are not aligned") + .SetDefault(true); + AddAttr("align_mode", + "(int, default \'1\'), optional for bilinear interpolation, " + "can be \'0\' for src_idx = scale*(dst_indx+0.5)-0.5 , " + "can be \'1\' for src_idx = scale*dst_index .") + .SetDefault(1); + AddAttr( + "data_layout", + "(string, default NCHW) Only used in " + "an optional string from: \"NHWC\", \"NCHW\". " + "Specify that the data format of the input and output data is " + "channel_first or channel_last.") + .SetDefault("NCHW"); + AddComment(R"DOC( + Batch resize images + )DOC"); + } +}; + +} // namespace data +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR( + batch_resize, ops::data::BatchResizeOp, + ops::data::BatchResizeOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_CPU_KERNEL(batch_resize, + ops::data::BatchResizeCPUKernel) diff --git a/paddle/fluid/operators/data/batch_resize_op.cu b/paddle/fluid/operators/data/batch_resize_op.cu new file mode 100644 index 00000000000000..e2c0319fdcf051 --- /dev/null +++ b/paddle/fluid/operators/data/batch_resize_op.cu @@ -0,0 +1,277 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/data/batch_resize_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/gpu_launch_config.h" + +namespace paddle { +namespace operators { +namespace data { + +using framework::LoDTensor; +using DataLayout = framework::DataLayout; + +template +__global__ void KeNearestNeighborInterpFw( + const T* in, const size_t in_img_h, const size_t in_img_w, + const size_t input_h, const size_t input_w, T* out, const size_t out_img_h, + const size_t out_img_w, const size_t output_h, const size_t output_w, + const size_t num_channels, const float ratio_h, const float ratio_w, + const bool align_corners, const DataLayout data_layout) { + int nthreads = output_h * output_w; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + for (; tid < nthreads; tid += stride) { + // batch size + int out_id_h = tid / output_w; + // single image's index + int out_id_w = tid % output_w; + // input_w or output_w = c * h * w + // img_size = h * w + int in_img_size = input_w / num_channels; + int out_img_size = output_w / num_channels; + + // get output c, h, w index + int channel_id, out_img_idy, out_img_idx; + if (data_layout == DataLayout::kNCHW) { + channel_id = out_id_w / out_img_size; + out_img_idy = (out_id_w % out_img_size) / out_img_w; + out_img_idx = tid % out_img_w; + } else { + out_img_idy = out_id_w / (out_img_w * num_channels); + out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; + channel_id = tid % num_channels; + } + + // get input h index 
with offset + int in_img_idy = (align_corners) + ? static_cast(ratio_h * out_img_idy + 0.5) + : static_cast(ratio_h * out_img_idy); + // get input w index with offset + int in_img_idx = (align_corners) + ? static_cast(ratio_w * out_img_idx + 0.5) + : static_cast(ratio_w * out_img_idx); + + if (data_layout == DataLayout::kNCHW) { + out[tid] = in[out_id_h * input_w + channel_id * in_img_size + + in_img_idy * in_img_w + in_img_idx]; + } else { + out[tid] = in[out_id_h * input_w + in_img_idy * in_img_w * num_channels + + in_img_idx * num_channels + channel_id]; + } + } +} + +template +__global__ void KeBilinearInterpFw( + const T* in, const size_t in_img_h, const size_t in_img_w, + const size_t input_h, const size_t input_w, T* out, + const size_t out_img_h, const size_t out_img_w, const size_t output_h, + const size_t output_w, const size_t num_channels, const float ratio_h, + const float ratio_w, const bool align_corners, const int align_mode, + const DataLayout data_layout) { + int nthreads = output_h * output_w; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + bool align_flag = (align_mode == 0 && !align_corners); + for (; tid < nthreads; tid += stride) { + // batch size + int out_id_h = tid / output_w; + // single image's index + int out_id_w = tid % output_w; + // input_w or output_w = c * h * w + // img_size = h * w + int in_img_size = input_w / num_channels; + int out_img_size = output_w / num_channels; + + // get output c, h, w index + int channel_id, out_img_idy, out_img_idx; + if (data_layout == DataLayout::kNCHW) { + channel_id = out_id_w / out_img_size; + out_img_idy = (out_id_w % out_img_size) / out_img_w; + out_img_idx = tid % out_img_w; + } else { + out_img_idy = out_id_w / (out_img_w * num_channels); + out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; + channel_id = tid % num_channels; + } + + // get input h index with offset + int in_img_idy = align_flag + ? 
static_cast(ratio_h * (out_img_idy + 0.5) - 0.5) + : static_cast(ratio_h * out_img_idy); + in_img_idy = in_img_idy > 0 ? in_img_idy : 0; + int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0; + T src_h = ratio_h * (out_img_idy + 0.5) - 0.5; + src_h = src_h > 0 ? src_h : 0; + T h1lambda = align_flag ? src_h - in_img_idy + : ratio_h * out_img_idy - in_img_idy; + T h2lambda = 1.f - h1lambda; + + // get input w index with offset + int in_img_idx = align_flag + ? static_cast(ratio_w * (out_img_idx + 0.5) - 0.5) + : static_cast(ratio_w * out_img_idx); + in_img_idx = in_img_idx > 0 ? in_img_idx : 0; + int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0; + T src_w = ratio_w * (out_img_idx + 0.5) - 0.5; + src_w = src_w > 0 ? src_w : 0; + T w1lambda = align_flag ? src_w - in_img_idx + : ratio_w * out_img_idx - in_img_idx; + T w2lambda = 1.f - w1lambda; + + if (data_layout == DataLayout::kNCHW) { + const T* in_pos = &in[out_id_h * input_w + channel_id * in_img_size + + in_img_idy * in_img_w + in_img_idx]; + + // bilinear interpolation + out[out_id_h * output_w + out_id_w] = + h2lambda * (w2lambda * in_pos[0] + w1lambda * in_pos[w_id]) + + h1lambda * (w2lambda * in_pos[h_id * in_img_w] + + w1lambda * in_pos[h_id * in_img_w + w_id]); + } else { + const T* in_pos = + &in[out_id_h * input_w + in_img_idy * in_img_w * num_channels + + in_img_idx * num_channels + channel_id]; + + // bilinear interpolation + out[out_id_h * output_w + out_id_w] = + h2lambda * + (w2lambda * in_pos[0] + w1lambda * in_pos[w_id * num_channels]) + + h1lambda * (w2lambda * in_pos[h_id * in_img_w * num_channels] + + w1lambda * in_pos[h_id * in_img_w * num_channels + + w_id * num_channels]); + } + } +} + +template +static void ResizeFwd( + const framework::ExecutionContext& ctx, const framework::LoDTensor& input, + framework::Tensor* output, const std::vector out_size, + const std::string interp_method, const bool align_corners, + const int align_mode, const int img_h, const int img_w, const int c, + const DataLayout 
data_layout) { + auto input_data = input.template data(); + int out_h = static_cast(out_size[0]); + int out_w = static_cast(out_size[1]); + + framework::DDim dim_out; + if (data_layout == DataLayout::kNCHW) { + dim_out = {c, out_h, out_w}; + } else { + dim_out = {out_h, out_w, c}; + } + auto output_data = output->data(); + + float ratio_h = 0.f; + float ratio_w = 0.f; + if (out_h > 1) { + ratio_h = (align_corners) ? static_cast(img_h - 1) / (out_h - 1) + : static_cast(img_h) / out_h; + } + if (out_w > 1) { + ratio_w = (align_corners) ? static_cast(img_w - 1) / (out_w - 1) + : static_cast(img_w) / out_w; + } + + int in_chw = c * img_h * img_w; + int out_chw = c * out_h * out_w; + + platform::GpuLaunchConfig config = + platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), out_chw); + + if ("nearest" == interp_method) { + KeNearestNeighborInterpFw< + T><<>>( + input_data, img_h, img_w, 1, in_chw, output_data, out_h, out_w, 1, + out_chw, c, ratio_h, ratio_w, align_corners, data_layout); + } else if ("bilinear" == interp_method) { + KeBilinearInterpFw<<>>( + input_data, img_h, img_w, 1, in_chw, output_data, out_h, out_w, 1, + out_chw, c, ratio_h, ratio_w, align_corners, align_mode, + data_layout); + } +} + +template +class BatchResizeCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + LOG(ERROR) << "BatchResizeCUDAKernel Compute start"; + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx.GetPlace()), true, + platform::errors::NotFound("This kernel only runs on GPU device.")); + // get input, output + auto* x = ctx.Input("X"); + PADDLE_ENFORCE_GT(x->size(), 0, + platform::errors::InvalidArgument( + "The size of X must be greater than 0.")); + auto* out = ctx.Output("Out"); + + // get size, scale, ratio + auto size = ctx.Attr>("size"); + + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + // get 
interpolation method + const std::string interp_method = ctx.Attr("interp_method"); + bool align_corners = ctx.Attr("align_corners"); + int align_mode = ctx.Attr("align_mode"); + + auto* img = &x->at(0); + int64_t img_c = data_layout == DataLayout::kNCHW ? \ + img->dims()[0] : img->dims()[2]; + + std::vector out_dim = {static_cast(x->size()), + img_c, size[0], size[1]}; + out->Resize(framework::make_ddim(out_dim)); + out->mutable_data(ctx.GetPlace()); + + int img_h, img_w, idx_h, idx_w, crop_h, crop_w; + for (int i = 0; i < x->size(); i++) { + img = &x->at(i); + img_h = + data_layout == DataLayout::kNCHW ? img->dims()[1] : img->dims()[0]; + img_w = + data_layout == DataLayout::kNCHW ? img->dims()[2] : img->dims()[1]; + // GetCropParameters(img_h, img_w, scale, ratio, &idx_h, &idx_w, &crop_h, + // &crop_w, seed); + + auto out_tensor = out->Slice(i, i + 1); + ResizeFwd(ctx, *img, &out_tensor, size, interp_method, + align_corners, align_mode, img_h, img_w, img_c, + data_layout); + } + + // framework::LoDTensorArray out_array; + // out_array.reserve(1); + // out_array.emplace_back(out); + // out_queue->Push(out_array); + LOG(ERROR) << "BatchResizeCUDAKernel Compute finish"; + } +}; + +} // namespace data +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(batch_resize, + ops::data::BatchResizeCUDAKernel, + ops::data::BatchResizeCUDAKernel); diff --git a/paddle/fluid/operators/data/batch_resize_op.h b/paddle/fluid/operators/data/batch_resize_op.h new file mode 100644 index 00000000000000..89c6b5bb0c949c --- /dev/null +++ b/paddle/fluid/operators/data/batch_resize_op.h @@ -0,0 +1,38 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { +namespace data { + +template +class BatchResizeCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + // no cpu kernel. + PADDLE_THROW(platform::errors::Unimplemented( + "BatchResize op only supports GPU now.")); + } +}; + +} // namespace data +} // namespace operators +} // namespace paddle + diff --git a/paddle/fluid/operators/data/nvjpeg_decoder.cc b/paddle/fluid/operators/data/nvjpeg_decoder.cc index 09cd8fcf32ebd1..f2d36cab23d1da 100644 --- a/paddle/fluid/operators/data/nvjpeg_decoder.cc +++ b/paddle/fluid/operators/data/nvjpeg_decoder.cc @@ -116,12 +116,13 @@ void NvjpegDecoder::ParseDecodeParams( PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecodeParamsSetOutputFormat(decode_params_, output_format)); - ROI roi; - roi_generator->GenerateRandomROI(width, height, &roi); + if (roi_generator) { + ROI roi; + roi_generator->GenerateRandomROI(width, height, &roi); - PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecodeParamsSetROI(decode_params_, roi.x, roi.y, roi.w, roi.h)); - - std::vector out_shape = {output_components, roi.h, roi.w}; + PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecodeParamsSetROI(decode_params_, roi.x, roi.y, roi.w, roi.h)); + } + std::vector out_shape = {output_components, height, width}; 
out->Resize(framework::make_ddim(out_shape)); // allocate memory and assign to out_image diff --git a/paddle/fluid/operators/random_crop_and_resize_op.cc b/paddle/fluid/operators/data/random_crop_and_resize_op.cc similarity index 90% rename from paddle/fluid/operators/random_crop_and_resize_op.cc rename to paddle/fluid/operators/data/random_crop_and_resize_op.cc index 9cb6c27d3d3e9d..55afed383c9bd0 100644 --- a/paddle/fluid/operators/random_crop_and_resize_op.cc +++ b/paddle/fluid/operators/data/random_crop_and_resize_op.cc @@ -12,10 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/random_crop_and_resize_op.h" +#include "paddle/fluid/operators/data/random_crop_and_resize_op.h" namespace paddle { namespace operators { +namespace data { class RandomCropAndResizeOp : public framework::OperatorWithKernel { public: @@ -113,25 +114,16 @@ class RandomCropAndResizeOpMaker : public framework::OpProtoAndCheckerMaker { } }; -template -class RandomCropAndResizeCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - // no cpu kernel. 
- PADDLE_THROW(platform::errors::Unimplemented( - "RandomCropAndResize op only supports GPU now.")); - } -}; - +} // namespace data } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR( - random_crop_and_resize, ops::RandomCropAndResizeOp, - ops::RandomCropAndResizeOpMaker, + random_crop_and_resize, ops::data::RandomCropAndResizeOp, + ops::data::RandomCropAndResizeOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); REGISTER_OP_CPU_KERNEL(random_crop_and_resize, - ops::RandomCropAndResizeCPUKernel) + ops::data::RandomCropAndResizeCPUKernel) diff --git a/paddle/fluid/operators/random_crop_and_resize_op.cu b/paddle/fluid/operators/data/random_crop_and_resize_op.cu similarity index 98% rename from paddle/fluid/operators/random_crop_and_resize_op.cu rename to paddle/fluid/operators/data/random_crop_and_resize_op.cu index 9b4bb2fc30ca35..4a5f0fc635c38b 100644 --- a/paddle/fluid/operators/random_crop_and_resize_op.cu +++ b/paddle/fluid/operators/data/random_crop_and_resize_op.cu @@ -12,13 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/operators/random_crop_and_resize_op.h" +#include "paddle/fluid/operators/data/random_crop_and_resize_op.h" #include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/gpu_launch_config.h" #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" namespace paddle { namespace operators { +namespace data { using framework::LoDTensor; using DataLayout = framework::DataLayout; @@ -350,10 +351,11 @@ class RandomCropAndResizeCUDAKernel : public framework::OpKernel { } }; +} // namespace data } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL(random_crop_and_resize, - ops::RandomCropAndResizeCUDAKernel, - ops::RandomCropAndResizeCUDAKernel); + ops::data::RandomCropAndResizeCUDAKernel, + ops::data::RandomCropAndResizeCUDAKernel); diff --git a/paddle/fluid/operators/random_crop_and_resize_op.h b/paddle/fluid/operators/data/random_crop_and_resize_op.h similarity index 68% rename from paddle/fluid/operators/random_crop_and_resize_op.h rename to paddle/fluid/operators/data/random_crop_and_resize_op.h index 820fc18043770c..aba5b481b70d20 100644 --- a/paddle/fluid/operators/random_crop_and_resize_op.h +++ b/paddle/fluid/operators/data/random_crop_and_resize_op.h @@ -26,3 +26,22 @@ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include #endif + +namespace paddle { +namespace operators { +namespace data { + +template +class RandomCropAndResizeCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + // no cpu kernel. 
+ PADDLE_THROW(platform::errors::Unimplemented( + "RandomCropAndResize op only supports GPU now.")); + } +}; + +} // namespace data +} // namespace operators +} // namespace paddle + diff --git a/paddle/fluid/operators/file_label_reader_op.h b/paddle/fluid/operators/file_label_reader_op.h index 2354db019ee3be..10836e25d3eb2d 100644 --- a/paddle/fluid/operators/file_label_reader_op.h +++ b/paddle/fluid/operators/file_label_reader_op.h @@ -209,9 +209,7 @@ class FileDataReaderWrapper { std::shared_ptr reader = nullptr; void ShutDown() { - LOG(ERROR) << "FileDataReaderWrapper shutdown enter"; - reader->ShutDown(); - LOG(ERROR) << "FileDataReaderWrapper shutdown finish"; + if (reader) reader->ShutDown(); } }; diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index fccb84b10713fd..3bbf72a7e84f36 100644 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -31,6 +31,7 @@ 'DeformConv2D', 'read_file', 'decode_jpeg', + 'image_decode', 'image_decode_random_crop', 'random_flip', 'roi_pool', @@ -40,6 +41,7 @@ 'roi_align', 'RoIAlign', 'random_crop_and_resize', + 'image_resize', ] @@ -907,7 +909,6 @@ def file_label_reader(file_root, batch_size, name=None): } helper = LayerHelper("file_label_reader", **locals()) - # out = helper.create_variable_for_type_inference('uint8') out = helper.create_variable( name=unique_name.generate("file_label_reader"), type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, @@ -921,6 +922,59 @@ def file_label_reader(file_root, batch_size, name=None): return out +def image_decode(x, mode='unchanged', num_threads=2, name=None): + """ + Decodes a JPEG image into a 3 dimensional RGB Tensor or 1 dimensional Gray Tensor. + Optionally converts the image to the desired format. + The values of the output tensor are uint8 between 0 and 255. + + Args: + x (Tensor): A one dimensional uint8 tensor containing the raw bytes + of the JPEG image. + mode (str): The read mode used for optionally converting the image. + Default: 'unchanged'. 
+ num_threads (int): parallel thread number. + name (str, optional): The default value is None. Normally there is no + need for user to set this property. For more information, please + refer to :ref:`api_guide_Name`. + Returns: + Tensor: A decoded image tensor with shape (imge_channels, image_height, image_width) + + Examples: + .. code-block:: python + import cv2 + import paddle + + fake_img = (np.random.random( + (400, 300, 3)) * 255).astype('uint8') + + cv2.imwrite('fake.jpg', fake_img) + + img_bytes = paddle.vision.ops.read_file('fake.jpg') + img = paddle.vision.ops.decode_jpeg(img_bytes) + + print(img.shape) + """ + + if in_dygraph_mode(): + return _C_ops.batch_decode( + x, "mode", mode, "num_threads", num_threads) + + inputs = {'X': x} + attrs = {"mode": mode, + "num_threads": num_threads} + + helper = LayerHelper("batch_decode", **locals()) + out = helper.create_variable( + name=unique_name.generate("image_decode"), + type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, + dtype=x.dtype) + helper.append_op( + type="batch_decode", inputs=inputs, attrs=attrs, outputs={"Out": out}) + + return out + + def image_decode_random_crop(x, mode='unchanged', num_threads=2, @@ -1538,3 +1592,86 @@ def random_crop_and_resize(x, outputs={"Out": out}, attrs=attrs) return out + + +def image_resize(x, + size, + interp_method='bilinear', + align_corners=True, + align_mode=1, + data_layout='NCHW', + seed=0, + name=None): + """ + This operator implements the paddle.vision.transforms.Resize. + + Please refer to https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/vision/transforms/Resized_cn.html#randomresizedcrop + for details. This operator has only a GPU kernel. + + Args: + x (List[Tensor]): A list of input images, 3D-Tensor with the shape + of [C,H,W] or [H,W,c]. The data type is uint8 or float32. + size (int|list|tuple): Target size of output image, + with (height, width) shape. + interp_method (str, optional): Interpolation method. Default: 'bilinear'. 
+ support method are as following: + - "nearest", + - "bilinear" + align_corners (bool, optional): If True, the centers of 4 corner pixels + of the input and output tensors are aligned, preserving the values + at the corner pixels, If False, are not aligned. Default: True + align_mode (int32, optional): Optional for bilinear interpolation, + can be 0 for src_idx = scale*(dst_indx+0.5)-0.5, can be 1 for + src_idx = scale*dst_index. Default: 1 + data_layout (str, optional): Only used in an optional string + from: NHWC, NCHW. Specify that the data format of the input + and output data is channel_first or channel_last. Default: NCHW + seed (int, optional): The random seed. Default: 0 + name(str, optional): For detailed information, please refer to : + ref:`api_guide_Name`. Usually name is no need to set and None by + default. + + Returns: + Tensor: The output of RandomCropAndResizeOp is a 4-D tensor with shape + (batch_size, channels, h, w). The data type is uint8 or float32. + + Examples: + .. 
code-block:: python + + import paddle + from paddle.vision.ops import random_crop_and_resize + + data = paddle.rand([3, 256, 256]) + out = random_crop_and_resize([data]) + """ + check_type(size, 'size', (int, tuple), 'random_crop_and_resize') + assert interp_method in ['bilinear', 'nearest'] + assert data_layout in ['NCHW', 'NHWC'] + if isinstance(size, int): + size = (size, size) + + if in_dygraph_mode(): + out = _C_ops.batch_resize( + x, "size", size, "interp_method", interp_method, + "align_corners", align_corners, "align_mode", + align_mode, "data_layout", data_layout, "seed", seed) + return out + + helper = LayerHelper('batch_resize', **locals()) + dtype = helper.input_dtype() + out = helper.create_variable_for_type_inference(dtype) + inputs = {"X": x} + attrs = { + "size": size, + "interp_method": interp_method, + "align_corners": align_corners, + "align_mode": align_mode, + "data_layout": data_layout, + "seed": seed, + } + helper.append_op( + type="batch_resize", + inputs=inputs, + outputs={"Out": out}, + attrs=attrs) + return out From 1fc2a58d5cb6c6b980f54f7f149efca98b92911b Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Wed, 5 Jan 2022 14:40:13 +0000 Subject: [PATCH 37/95] add local_rank for batch_decode --- paddle/fluid/operators/data/batch_decode_op.cc | 3 +++ paddle/fluid/operators/data/batch_decode_op.cu | 3 ++- python/paddle/vision/ops.py | 8 ++++++-- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/data/batch_decode_op.cc b/paddle/fluid/operators/data/batch_decode_op.cc index 6d97e4fa426c00..ea11c1da36d5e9 100644 --- a/paddle/fluid/operators/data/batch_decode_op.cc +++ b/paddle/fluid/operators/data/batch_decode_op.cc @@ -91,6 +91,9 @@ and 255. 
"for optionally converting the image, can be \"unchanged\" " ",\"gray\" , \"rgb\" .") .SetDefault("unchanged"); + AddAttr("local_rank", + "(int64_t)" + "The index of the op to start execution"); } }; diff --git a/paddle/fluid/operators/data/batch_decode_op.cu b/paddle/fluid/operators/data/batch_decode_op.cu index 6b906521b51d6a..c8657d3a150838 100644 --- a/paddle/fluid/operators/data/batch_decode_op.cu +++ b/paddle/fluid/operators/data/batch_decode_op.cu @@ -32,11 +32,12 @@ class GPUBatchDecodeKernel : public framework::OpKernel { int num_threads = ctx.Attr("num_threads"); LOG(ERROR) << "GPUBatchDecodeJpegKernel Compute start, num_threads: " << num_threads; auto mode = ctx.Attr("mode"); + auto local_rank = ctx.Attr("local_rank"); // multi-phrase decode thread pool if (!decode_pool) { LOG(ERROR) << "GPUBatchDecodeJpegKernel decode_pool init"; - decode_pool = new NvjpegDecoderThreadPool(num_threads, mode); + decode_pool = new NvjpegDecoderThreadPool(num_threads, mode, local_rank); } const framework::LoDTensorArray* inputs = diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index da2315b17c604b..9c3a8ac45d671e 100644 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -964,13 +964,17 @@ def image_decode(x, mode='unchanged', num_threads=2, name=None): print(img.shape) """ + local_rank = paddle.distributed.get_rank() + if in_dygraph_mode(): return _C_ops.batch_decode( - x, "mode", mode, "num_threads", num_threads) + x, "mode", mode, "num_threads", num_threads, + "local_rank", local_rank) inputs = {'X': x} attrs = {"mode": mode, - "num_threads": num_threads} + "num_threads": num_threads, + "local_rank": local_rank} helper = LayerHelper("batch_decode", **locals()) out = helper.create_variable( From 6a749498f17b6612787418403101dff546a12250 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Sun, 9 Jan 2022 09:59:26 +0000 Subject: [PATCH 38/95] refine shutdown --- .../data/batch_decode_random_crop_op.cu | 2 +- 
paddle/fluid/operators/data/map_runner.cc | 24 ++++--------------- paddle/fluid/operators/data/map_runner.h | 8 ++++--- paddle/fluid/operators/data/nvjpeg_decoder.cc | 7 +----- paddle/fluid/operators/data/shutdown.h | 12 +++++----- paddle/fluid/operators/file_label_reader_op.h | 3 ++- 6 files changed, 19 insertions(+), 37 deletions(-) diff --git a/paddle/fluid/operators/data/batch_decode_random_crop_op.cu b/paddle/fluid/operators/data/batch_decode_random_crop_op.cu index dc8450c4e17588..0db44f0cc8bfb3 100644 --- a/paddle/fluid/operators/data/batch_decode_random_crop_op.cu +++ b/paddle/fluid/operators/data/batch_decode_random_crop_op.cu @@ -23,7 +23,7 @@ namespace data { using LoDTensorBlockingQueueHolder = operators::reader::LoDTensorBlockingQueueHolder; -static NvjpegDecoderThreadPool* decode_pool = nullptr; +NvjpegDecoderThreadPool* decode_pool = nullptr; // std::seed_seq* rand_seq = nullptr; template diff --git a/paddle/fluid/operators/data/map_runner.cc b/paddle/fluid/operators/data/map_runner.cc index 3d9260581f89f7..006b855aa03090 100644 --- a/paddle/fluid/operators/data/map_runner.cc +++ b/paddle/fluid/operators/data/map_runner.cc @@ -108,7 +108,7 @@ bool MapRunner::ShareInputsIntoScope(Scope* scope) { } void MapRunner::StartMapThread(const Scope* scope) { - results_.emplace_back(thread_pool_.enqueue([this, scope]() -> bool { + thread_pool_.enqueue([this, scope]() -> void { auto& scope_ = scope->NewScope(); framework::Executor executor(place_); while (running_.load()) { @@ -117,7 +117,6 @@ void MapRunner::StartMapThread(const Scope* scope) { bool success = ShareInputsIntoScope(&scope_); if (!success) { ShutDown(); - return false; } // LOG(ERROR) << "MapThread Loop " << program_id_ << " ShareInputsIntoScope finish"; @@ -153,8 +152,7 @@ void MapRunner::StartMapThread(const Scope* scope) { } scope->DeleteScope(&scope_); // LOG(ERROR) << "MapThread Loop " << program_id_ << " delete scope and return"; - return true; - })); + }); } void 
MapRunner::CheckOutputVarStatus(const Variable &var, @@ -184,25 +182,11 @@ void MapRunner::CheckOutputVarStatus(const Variable &var, void MapRunner::ShutDown() { VLOG(1) << "MapRunner shutdown " << program_id_; // close all output queue, op after this op can shutdown itself - LOG(ERROR) << "MapRunner ShutDown"; - for (auto queue : output_queues_) { - queue->Close(); - } - LOG(ERROR) << "MapRunner ShutDown queue closed " << program_id_; - running_.store(false); - LOG(ERROR) << "MapRunner ShutDown running false" << program_id_; - for (auto&& result: results_) { - LOG(ERROR) << "MapRunner get result " << program_id_; - result.get(); - LOG(ERROR) << "MapRunner get result finish" << program_id_; + for (auto queue : output_queues_) { + if(queue) queue->Close(); } - - // set running_ as false to exit map thread, then release thread pool - // // FIXME: ThreadPool doesn't have shutdown method - // delete &thread_pool_; - // LOG(ERROR) << "MapRunner ShutDown thread_pool_ deleted"; } // initialization static variables out of MapRunnerManager diff --git a/paddle/fluid/operators/data/map_runner.h b/paddle/fluid/operators/data/map_runner.h index ab3ce8ea7e8a0e..5d23074cc0e691 100644 --- a/paddle/fluid/operators/data/map_runner.h +++ b/paddle/fluid/operators/data/map_runner.h @@ -72,7 +72,6 @@ class MapRunner { void CheckOutputVarStatus(const Variable &var, const std::string &var_name); ThreadPool thread_pool_; - std::vector> results_; std::atomic running_; std::shared_ptr map_block_; @@ -125,18 +124,21 @@ class MapRunnerManager { void ShutDownMapRunner(int program_id) { auto iter = prog_id_to_runner_.find(program_id); if (iter != prog_id_to_runner_.end()) { + std::lock_guard lk(m_); iter->second.get()->ShutDown(); prog_id_to_runner_.erase(iter); } } void ShutDown() { + if (prog_id_to_runner_.empty()) return; + + std::lock_guard lk(m_); auto iter = prog_id_to_runner_.begin(); for (; iter != prog_id_to_runner_.end(); iter++) { - iter->second.get()->ShutDown(); + if 
(iter->second.get()) iter->second.get()->ShutDown(); LOG(ERROR) << "MapRunnerManager prog_id " << iter->first << " shutdown finish"; } - prog_id_to_runner_.clear(); } MapRunnerManager() { VLOG(1) << "MapRunnerManager init"; } diff --git a/paddle/fluid/operators/data/nvjpeg_decoder.cc b/paddle/fluid/operators/data/nvjpeg_decoder.cc index 56f6e288bd6a45..f9428ee5b8c7c9 100644 --- a/paddle/fluid/operators/data/nvjpeg_decoder.cc +++ b/paddle/fluid/operators/data/nvjpeg_decoder.cc @@ -204,8 +204,6 @@ void NvjpegDecoderThreadPool::WaitTillTasksCompleted() { } void NvjpegDecoderThreadPool::ShutDown() { - // LOG(ERROR) << "NvjpegDecoderThreadPool ShutDown enter"; - std::unique_lock lock(mutex_); running_ = false; shutdown_ = true; @@ -213,13 +211,10 @@ void NvjpegDecoderThreadPool::ShutDown() { lock.unlock(); for (auto& thread : threads_) { - // LOG(ERROR) << "NvjpegDecoderThreadPool ShutDown thread join, shutdown_ " << shutdown_; - thread.join(); - // LOG(ERROR) << "NvjpegDecoderThreadPool ShutDown thread join finish 1"; + if (thread.joinable()) thread.join(); } task_queue_.clear(); - // LOG(ERROR) << "NvjpegDecoderThreadPool ShutDown finish"; } void NvjpegDecoderThreadPool::SortTaskByLengthDescend() { diff --git a/paddle/fluid/operators/data/shutdown.h b/paddle/fluid/operators/data/shutdown.h index 90a0eab103ede0..7680d4d530a5b9 100644 --- a/paddle/fluid/operators/data/shutdown.h +++ b/paddle/fluid/operators/data/shutdown.h @@ -34,13 +34,13 @@ void ShutDownDataLoader() { reader_wrapper.ShutDown(); LOG(ERROR) << "ShutDownDataLoader reader_wrapper shutdown finish"; - // // step 2: shutdown decoder - // decode_pool->ShutDown(); - // LOG(ERROR) << "ShutDownDataLoader decode_pool shutdown finish"; + // step 2: shutdown decoder + if (decode_pool) decode_pool->ShutDown(); + LOG(ERROR) << "ShutDownDataLoader decode_pool shutdown finish"; - // // step 3: shutdown MapRunner - // MapRunnerManager::Instance()->ShutDown(); - // LOG(ERROR) << "ShutDownDataLoader MapRunner shutdown 
finish"; + // step 3: shutdown MapRunner + MapRunnerManager::Instance()->ShutDown(); + LOG(ERROR) << "ShutDownDataLoader MapRunner shutdown finish"; // // step 4: shutdown pipeline // PipelineManager::Instance()->ShutDown(); diff --git a/paddle/fluid/operators/file_label_reader_op.h b/paddle/fluid/operators/file_label_reader_op.h index 0eff911712a2ac..6d828688e8ec06 100644 --- a/paddle/fluid/operators/file_label_reader_op.h +++ b/paddle/fluid/operators/file_label_reader_op.h @@ -151,6 +151,7 @@ class FileDataReader { void ShutDown() { if (queue_) queue_->Close(); + if (label_queue_) label_queue_->Close(); is_closed_.store(true); if (load_thrd_.joinable()) { @@ -238,7 +239,7 @@ class FileDataReaderWrapper { std::shared_ptr reader = nullptr; void ShutDown() { - if (reader) reader->ShutDown(); + if (reader.get()) reader->ShutDown(); } }; From ec2ff93d62ee2c73b512b8cbbfb8cb346613e065 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Sun, 9 Jan 2022 14:10:43 +0000 Subject: [PATCH 39/95] remove prefetch thread/queue in pipeline --- paddle/fluid/operators/data/dataloader_op.cc | 8 +- paddle/fluid/operators/data/dataloader_op.h | 4 +- paddle/fluid/operators/data/pipeline.cc | 109 +++++-------------- paddle/fluid/operators/data/pipeline.h | 49 ++------- paddle/fluid/operators/data/shutdown.h | 4 - 5 files changed, 44 insertions(+), 130 deletions(-) diff --git a/paddle/fluid/operators/data/dataloader_op.cc b/paddle/fluid/operators/data/dataloader_op.cc index 52eddb25c79f73..12b7217bf985ea 100644 --- a/paddle/fluid/operators/data/dataloader_op.cc +++ b/paddle/fluid/operators/data/dataloader_op.cc @@ -62,10 +62,10 @@ class DataLoaderOpMaker : public framework::OpProtoAndCheckerMaker { "(int64_t)" "The unique hash id used as cache key for " "ExecutorInfoCache"); - AddAttr("prefetch_depth", - "(int64_t)" - "The prefetch batch number") - .SetDefault(2); + // AddAttr("prefetch_depth", + // "(int64_t)" + // "The prefetch batch number") + // .SetDefault(2); AddComment(R"DOC( 
DataLoader Op )DOC"); diff --git a/paddle/fluid/operators/data/dataloader_op.h b/paddle/fluid/operators/data/dataloader_op.h index 611c720e2d1539..2198e65d8fb2e7 100644 --- a/paddle/fluid/operators/data/dataloader_op.h +++ b/paddle/fluid/operators/data/dataloader_op.h @@ -31,12 +31,10 @@ class DataLoaderOpKernel : public framework::OpKernel { auto start_op_index = ctx.Attr("start_op_index"); auto end_op_index = ctx.Attr("end_op_index"); auto program_id = ctx.Attr("program_id"); - auto prefetch_depth = - static_cast(ctx.Attr("prefetch_depth")); auto pipeline = data::PipelineManager::Instance()->GetPipeline( program_id, global_block, ctx.GetPlace(), start_op_index, end_op_index, - output_var_names, prefetch_depth); + output_var_names); pipeline->ReadNext(output_vars); LOG(ERROR) << "DataLoaderOpKernel finish"; diff --git a/paddle/fluid/operators/data/pipeline.cc b/paddle/fluid/operators/data/pipeline.cc index 17cc9d1e121196..ed5fed3584bdd2 100644 --- a/paddle/fluid/operators/data/pipeline.cc +++ b/paddle/fluid/operators/data/pipeline.cc @@ -19,18 +19,14 @@ namespace data { Pipeline::Pipeline(const std::shared_ptr global_block, const platform::Place &place, int64_t start_op_index, int64_t end_op_index, int64_t program_id, - const std::vector &output_var_names, - size_t prefetch_queue_size) - : thread_pool_(1), - closed_(false), + const std::vector &output_var_names) + : global_block_(global_block), place_(place), start_op_index_(start_op_index), end_op_index_(end_op_index), program_id_(program_id), - output_var_names_(output_var_names), - prefetch_queue_size_(prefetch_queue_size), - prefetch_queue_(prefetch_queue_size) { + output_var_names_(output_var_names) { VLOG(1) << "Pipeline init"; PADDLE_ENFORCE_GT(end_op_index_, start_op_index_, @@ -61,52 +57,17 @@ Pipeline::Pipeline(const std::shared_ptr global_block, } // Step3: start prefetch thread - StartPrefetchThread(parallel_executor, skip_eager_delete_vars); -} - -void Pipeline::StartPrefetchThread(std::shared_ptr 
executor, - const std::vector &skip_vars) { - thread_pool_.enqueue([this, executor, skip_vars]() -> void { - while (!closed_.load()) { - LOG(ERROR) << "Executor run a iter start"; - // Step1: run ops by executor without fetch - executor->RunWithoutFetch(skip_vars); - - // Step2: fetch output variable to LoDTensor vector - framework::LoDTensorArray t_arr; - t_arr.resize(output_var_names_.size()); - for (size_t i = 0; i < output_var_names_.size(); i++) { - auto *out_var = scope_.FindVar(output_var_names_[i]); - PADDLE_ENFORCE_NOT_NULL( - out_var, platform::errors::NotFound( - "The output variable %s is not found in DataLoader " - "program's internal scope", - output_var_names_[i])); - // CheckOutputVarStatus(*out_var, output_var_names_[i]); - // copy_tensor(out_var->Get(), &t_arr[i]); - auto out_queue = out_var->Get().GetQueue(); - bool success = true; - auto outputs = out_queue->Pop(&success); - PADDLE_ENFORCE_EQ(success, true, - platform::errors::PreconditionNotMet("Read from input queue failed")); - copy_tensor(outputs.at(0), &t_arr[i]); - } - - // TODO: dataset drain check - // if dataset drained: - // closed_.store(true) - // break - - // Step3: put LoDTensorArray to prefetch blocking_queue - prefetch_queue_.Push(t_arr); - LOG(ERROR) << "Executor run a iter finish"; - } - }); + parallel_executor->RunWithoutFetch(skip_eager_delete_vars); } void Pipeline::CheckOutputVarStatus(const Variable &var, const std::string &var_name) { // only LoDTensor variable type support currently + PADDLE_ENFORCE_EQ(var.IsInitialized(), true, + platform::errors::InvalidArgument( + "The tensor in output variable %s get from DataLoader " + "program's internal scope is not initialized.", + var_name)); PADDLE_ENFORCE_EQ( var.IsType(), true, platform::errors::InvalidArgument( @@ -114,45 +75,31 @@ void Pipeline::CheckOutputVarStatus(const Variable &var, "internal scope holds wrong type. 
Expect type is " "LoDTensor, but receive type is %s.", var_name, platform::demangle(framework::ToTypeName(var.Type())))); - PADDLE_ENFORCE_EQ(var.Get().IsInitialized(), true, - platform::errors::InvalidArgument( - "The tensor in output variable %s get from DataLoader " - "program's internal scope is not initialized.", - var_name)); } void Pipeline::ReadNext(std::vector &out_vars) { - bool ok = true; - auto vars = prefetch_queue_.Pop(&ok); - PADDLE_ENFORCE_EQ( - ok, true, platform::errors::Unavailable("Pop prefetch queue failed.")); - PADDLE_ENFORCE_EQ( - out_vars.size(), vars.size(), - platform::errors::InvalidArgument( - "Output variable number to read should be variable number " - "read from prefetch queue, but recieved %d != %d", - out_vars.size(), output_var_names_.size())); - - for (size_t i = 0; i < vars.size(); i++) { - copy_tensor(vars[i], out_vars[i]->GetMutable()); + PADDLE_ENFORCE_EQ(out_vars.size(), output_var_names_.size(), + platform::errors::InvalidArgument( + "Out variable number should equal to output variable name " + "number, but receive %d != %d", out_vars.size(), + output_var_names_.size())); + for (size_t i = 0; i < output_var_names_.size(); i++) { + auto *out_var = scope_.FindVar(output_var_names_[i]); + PADDLE_ENFORCE_NOT_NULL( + out_var, platform::errors::NotFound( + "The output variable %s is not found in DataLoader " + "program's internal scope", + output_var_names_[i])); + auto out_queue = out_var->Get().GetQueue(); + bool success = true; + auto outputs = out_queue->Pop(&success); + PADDLE_ENFORCE_EQ(success, true, + platform::errors::PreconditionNotMet("Read from input queue failed")); + CheckOutputVarStatus(*(out_vars[i]), output_var_names_[i]); + copy_tensor(outputs.at(0), out_vars[i]->GetMutable()); } } -void Pipeline::ShutDown() { - VLOG(1) << "Pipeline close"; - closed_.store(true); - prefetch_queue_.Close(); -} - -void Pipeline::Reset() { - // (TODO)Step1: reset dataset - // - // Step2: reopen pipeline - prefetch_queue_.ReOpen(); 
- closed_.store(false); - // StartPrefetchThread(); -} - // initialization static variables out of PipelineManager PipelineManager *PipelineManager::pm_instance_ptr_ = nullptr; std::mutex PipelineManager::m_; diff --git a/paddle/fluid/operators/data/pipeline.h b/paddle/fluid/operators/data/pipeline.h index 5d297533548258..fd472de072de5f 100644 --- a/paddle/fluid/operators/data/pipeline.h +++ b/paddle/fluid/operators/data/pipeline.h @@ -38,45 +38,25 @@ class Pipeline { Pipeline(const std::shared_ptr global_block, const platform::Place &place, int64_t start_op_index, int64_t end_op_index, int64_t program_id, - const std::vector &output_var_names, - size_t prefetch_queue_size); + const std::vector &output_var_names); + // size_t prefetch_queue_size); - // ~Pipeline() { - // VLOG(1) << "~Pipeline"; - // Close(); - // } - - inline size_t PrefetchCap() { return prefetch_queue_.Cap(); } - - inline size_t PrefetchSize() { return prefetch_queue_.Size(); } - - inline bool IsClosed() { return closed_; } - - inline void Close(); - - inline void Reset(); + ~Pipeline() { VLOG(1) << "~Pipeline"; } void ReadNext(std::vector &out_vars); - void ShutDown(); - private: - void copy_tensor(const framework::LoDTensor &lod_tensor, - framework::LoDTensor *out) const { + + void CheckOutputVarStatus(const Variable &var, const std::string &var_name); + + void copy_tensor(const framework::LoDTensor& lod_tensor, + framework::LoDTensor* out) const { if (lod_tensor.numel() == 0) return; - auto &out_tensor = *out; + auto& out_tensor = *out; TensorCopy(lod_tensor, lod_tensor.place(), &out_tensor); out_tensor.set_lod(lod_tensor.lod()); } - void StartPrefetchThread(std::shared_ptr executor, - const std::vector &skip_vars); - - void CheckOutputVarStatus(const Variable &var, const std::string &var_name); - - ThreadPool thread_pool_; - std::atomic closed_; - Scope scope_; std::shared_ptr global_block_; platform::Place place_; @@ -86,8 +66,6 @@ class Pipeline { std::vector output_var_names_; - const 
size_t prefetch_queue_size_; - LoDTensorBlockingQueue prefetch_queue_; }; class PipelineManager { @@ -115,13 +93,12 @@ class PipelineManager { Pipeline* GetPipeline( int64_t program_id, BlockDesc *global_block, const platform::Place &place, int64_t start_op_index, int64_t end_op_index, - const std::vector &output_var_names, - size_t prefetch_queue_size) { + const std::vector &output_var_names) { auto iter = prog_id_to_pipeline_.find(program_id); if (iter == prog_id_to_pipeline_.end()) { prog_id_to_pipeline_[program_id] = std::unique_ptr(new Pipeline( std::shared_ptr(global_block), place, start_op_index, - end_op_index, program_id, output_var_names, prefetch_queue_size)); + end_op_index, program_id, output_var_names)); return prog_id_to_pipeline_[program_id].get(); } else { return iter->second.get(); @@ -129,10 +106,6 @@ class PipelineManager { } void ShutDown() { - auto iter = prog_id_to_pipeline_.begin(); - for (; iter != prog_id_to_pipeline_.end(); iter++) { - iter->second.get()->ShutDown(); - } prog_id_to_pipeline_.clear(); } diff --git a/paddle/fluid/operators/data/shutdown.h b/paddle/fluid/operators/data/shutdown.h index 7680d4d530a5b9..5728867135b113 100644 --- a/paddle/fluid/operators/data/shutdown.h +++ b/paddle/fluid/operators/data/shutdown.h @@ -41,10 +41,6 @@ void ShutDownDataLoader() { // step 3: shutdown MapRunner MapRunnerManager::Instance()->ShutDown(); LOG(ERROR) << "ShutDownDataLoader MapRunner shutdown finish"; - - // // step 4: shutdown pipeline - // PipelineManager::Instance()->ShutDown(); - // LOG(ERROR) << "ShutDownDataLoader PipelineManager shutdown finish"; } } // namespace data From b5fef20e7b4d388ee4a5b5829325a7274ef53fc7 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Sun, 9 Jan 2022 14:48:32 +0000 Subject: [PATCH 40/95] map support multi inputs/outputs --- paddle/fluid/operators/data/map_op.cc | 18 +++++++++++------- paddle/fluid/operators/data/map_runner.h | 8 ++++---- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git 
a/paddle/fluid/operators/data/map_op.cc b/paddle/fluid/operators/data/map_op.cc index 41b80a38fd0858..9e8d93305b4031 100644 --- a/paddle/fluid/operators/data/map_op.cc +++ b/paddle/fluid/operators/data/map_op.cc @@ -44,15 +44,19 @@ class MapOp : public framework::OperatorBase { const platform::Place& dev_place) const override { // LOG(ERROR) << "MapOpKernel RunImpl enter"; // Step1: get output vars and attrs - // FIXME(dkp): multi input support - auto input_var = scope.FindVar(Input("In")); - auto output_var = scope.FindVar(Output("Out")); + auto inputs = Inputs("In"); std::vector input_vars; - input_vars.reserve(1); - input_vars.emplace_back(input_var); + input_vars.reserve(inputs.size()); + for (auto& input : inputs) { + input_vars.emplace_back(scope.FindVar(input)); + } + + auto outputs = Outputs("Out"); std::vector output_vars; - output_vars.reserve(1); - output_vars.emplace_back(output_var); + output_vars.reserve(outputs.size()); + for (auto& output : outputs) { + output_vars.emplace_back(scope.FindVar(output)); + } CheckInputQueueStatus(input_vars); CheckAndInitOutputQueue(output_vars, /*capacity=*/2); diff --git a/paddle/fluid/operators/data/map_runner.h b/paddle/fluid/operators/data/map_runner.h index 5d23074cc0e691..e5ade5adf20d5f 100644 --- a/paddle/fluid/operators/data/map_runner.h +++ b/paddle/fluid/operators/data/map_runner.h @@ -45,10 +45,10 @@ class MapRunner { const std::vector> input_queues, const std::vector> output_queues); - // ~MapRunner() { - // VLOG(1) << "~MapRunner"; - // Close(); - // } + ~MapRunner() { + VLOG(1) << "~MapRunner"; + ShutDown(); + } void ShutDown(); From c5860c90d6f3eba2e99a791a292bcf9fe21b71a2 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Mon, 10 Jan 2022 05:19:31 +0000 Subject: [PATCH 41/95] add SIGSEGV handler for map_runner --- paddle/fluid/operators/data/map_runner.cc | 19 ++++++++++++++++++- paddle/fluid/operators/data/map_runner.h | 1 - 2 files changed, 18 insertions(+), 2 deletions(-) diff --git 
a/paddle/fluid/operators/data/map_runner.cc b/paddle/fluid/operators/data/map_runner.cc index 006b855aa03090..b21492a647ff8e 100644 --- a/paddle/fluid/operators/data/map_runner.cc +++ b/paddle/fluid/operators/data/map_runner.cc @@ -9,6 +9,8 @@ See the License for the specific language governing permissions and limitations under the License. */ +#include + #include "paddle/fluid/operators/data/map_runner.h" #include "paddle/fluid/framework/executor_cache.h" @@ -107,8 +109,19 @@ bool MapRunner::ShareInputsIntoScope(Scope* scope) { return true; } +void signal_handler(int sig_num) { + VLOG(1) << "MapThread crash with signal " << sig_num; + LOG(ERROR) << "MapThread crash with signal " << sig_num; + _exit(-1); +} + void MapRunner::StartMapThread(const Scope* scope) { thread_pool_.enqueue([this, scope]() -> void { + // MapThread may crash with SIGSEGV singal in Executor::Prepare + // when Python program break and exit, catch SIGSEGV singal and + // exit thread silently + signal(SIGSEGV, signal_handler); + auto& scope_ = scope->NewScope(); framework::Executor executor(place_); while (running_.load()) { @@ -121,7 +134,11 @@ void MapRunner::StartMapThread(const Scope* scope) { // LOG(ERROR) << "MapThread Loop " << program_id_ << " ShareInputsIntoScope finish"; // Step 2: run ops by executor without fetch - executor.Run(*map_block_->Program(), &scope_, static_cast(map_block_->ID()), false, true, std::vector(), false, true); + try { + executor.Run(*map_block_->Program(), &scope_, static_cast(map_block_->ID()), false, true, std::vector(), false, true); + } catch(...) 
{ + break; + } // LOG(ERROR) << "MapThread Loop " << program_id_ << " program run finish"; // Step 3: fetch output variable to LoDTensor vector diff --git a/paddle/fluid/operators/data/map_runner.h b/paddle/fluid/operators/data/map_runner.h index e5ade5adf20d5f..1554520ba56465 100644 --- a/paddle/fluid/operators/data/map_runner.h +++ b/paddle/fluid/operators/data/map_runner.h @@ -137,7 +137,6 @@ class MapRunnerManager { auto iter = prog_id_to_runner_.begin(); for (; iter != prog_id_to_runner_.end(); iter++) { if (iter->second.get()) iter->second.get()->ShutDown(); - LOG(ERROR) << "MapRunnerManager prog_id " << iter->first << " shutdown finish"; } } From 9e03339b97bc138d4891b75e0d477abbc1181dda Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 11 Jan 2022 15:04:54 +0000 Subject: [PATCH 42/95] add end of epoch process --- paddle/fluid/operators/data/dataloader_op.h | 8 ++++++ paddle/fluid/operators/data/map_runner.cc | 8 ++++-- paddle/fluid/operators/data/map_runner.h | 2 +- paddle/fluid/operators/data/pipeline.cc | 12 ++++++--- paddle/fluid/operators/data/pipeline.h | 8 ++++++ paddle/fluid/operators/data/shutdown.h | 1 - paddle/fluid/operators/file_label_reader_op.h | 18 ++++++++++--- python/paddle/fluid/dataloader/pipeline.py | 26 +++++++++---------- python/paddle/fluid/reader.py | 7 ++--- 9 files changed, 62 insertions(+), 28 deletions(-) diff --git a/paddle/fluid/operators/data/dataloader_op.h b/paddle/fluid/operators/data/dataloader_op.h index 2198e65d8fb2e7..20d491442bdc39 100644 --- a/paddle/fluid/operators/data/dataloader_op.h +++ b/paddle/fluid/operators/data/dataloader_op.h @@ -37,6 +37,14 @@ class DataLoaderOpKernel : public framework::OpKernel { output_var_names); pipeline->ReadNext(output_vars); + + if (!pipeline->IsRunning()) { + LOG(ERROR) << "DataLoaderOpKernel Pipeline not running"; + data::PipelineManager::Instance()->ShutDownPipeline(program_id); + throw platform::EOFException("DataLoaderOpKernel epoch end", + __FILE__, __LINE__); + } + 
LOG(ERROR) << "DataLoaderOpKernel finish"; } }; diff --git a/paddle/fluid/operators/data/map_runner.cc b/paddle/fluid/operators/data/map_runner.cc index b21492a647ff8e..2c6cc0ab1ebd48 100644 --- a/paddle/fluid/operators/data/map_runner.cc +++ b/paddle/fluid/operators/data/map_runner.cc @@ -111,7 +111,6 @@ bool MapRunner::ShareInputsIntoScope(Scope* scope) { void signal_handler(int sig_num) { VLOG(1) << "MapThread crash with signal " << sig_num; - LOG(ERROR) << "MapThread crash with signal " << sig_num; _exit(-1); } @@ -129,7 +128,12 @@ void MapRunner::StartMapThread(const Scope* scope) { // LOG(ERROR) << "MapThread Loop " << program_id_ << " start"; bool success = ShareInputsIntoScope(&scope_); if (!success) { - ShutDown(); + for(auto& queue : output_queues_) { + while(queue->Size()) sleep(0.5); + queue->Close(); + } + running_.store(false); + return; } // LOG(ERROR) << "MapThread Loop " << program_id_ << " ShareInputsIntoScope finish"; diff --git a/paddle/fluid/operators/data/map_runner.h b/paddle/fluid/operators/data/map_runner.h index 1554520ba56465..8fb3d277290c56 100644 --- a/paddle/fluid/operators/data/map_runner.h +++ b/paddle/fluid/operators/data/map_runner.h @@ -52,7 +52,7 @@ class MapRunner { void ShutDown(); - inline bool IsRunning() { return running_; } + inline bool IsRunning() { return running_.load(); } private: diff --git a/paddle/fluid/operators/data/pipeline.cc b/paddle/fluid/operators/data/pipeline.cc index ed5fed3584bdd2..8d4dc269fb6996 100644 --- a/paddle/fluid/operators/data/pipeline.cc +++ b/paddle/fluid/operators/data/pipeline.cc @@ -20,7 +20,7 @@ Pipeline::Pipeline(const std::shared_ptr global_block, const platform::Place &place, int64_t start_op_index, int64_t end_op_index, int64_t program_id, const std::vector &output_var_names) - : + : running_(true), global_block_(global_block), place_(place), start_op_index_(start_op_index), @@ -91,11 +91,17 @@ void Pipeline::ReadNext(std::vector &out_vars) { "program's internal scope", 
output_var_names_[i])); auto out_queue = out_var->Get().GetQueue(); + if (out_queue->IsClosed()) { + running_.store(false); + return; + } + bool success = true; auto outputs = out_queue->Pop(&success); PADDLE_ENFORCE_EQ(success, true, - platform::errors::PreconditionNotMet("Read from input queue failed")); - CheckOutputVarStatus(*(out_vars[i]), output_var_names_[i]); + platform::errors::PreconditionNotMet("Read from output queue %s failed", output_var_names_[i])); + + // CheckOutputVarStatus(*(out_vars[i]), output_var_names_[i]); copy_tensor(outputs.at(0), out_vars[i]->GetMutable()); } } diff --git a/paddle/fluid/operators/data/pipeline.h b/paddle/fluid/operators/data/pipeline.h index fd472de072de5f..82e69e6b30cf31 100644 --- a/paddle/fluid/operators/data/pipeline.h +++ b/paddle/fluid/operators/data/pipeline.h @@ -45,6 +45,8 @@ class Pipeline { void ReadNext(std::vector &out_vars); + inline bool IsRunning() { return running_.load(); } + private: void CheckOutputVarStatus(const Variable &var, const std::string &var_name); @@ -57,6 +59,8 @@ class Pipeline { out_tensor.set_lod(lod_tensor.lod()); } + std::atomic running_; + Scope scope_; std::shared_ptr global_block_; platform::Place place_; @@ -105,6 +109,10 @@ class PipelineManager { } } + void ShutDownPipeline(int64_t program_id) { + prog_id_to_pipeline_.erase(program_id); + } + void ShutDown() { prog_id_to_pipeline_.clear(); } diff --git a/paddle/fluid/operators/data/shutdown.h b/paddle/fluid/operators/data/shutdown.h index 5728867135b113..e3dc8616a6ae91 100644 --- a/paddle/fluid/operators/data/shutdown.h +++ b/paddle/fluid/operators/data/shutdown.h @@ -16,7 +16,6 @@ #include "paddle/fluid/operators/file_label_reader_op.h" #include "paddle/fluid/operators/data/nvjpeg_decoder.h" #include "paddle/fluid/operators/data/map_runner.h" -#include "paddle/fluid/operators/data/pipeline.h" namespace paddle { diff --git a/paddle/fluid/operators/file_label_reader_op.h b/paddle/fluid/operators/file_label_reader_op.h index 
6d828688e8ec06..12cdcc2d1b5b54 100644 --- a/paddle/fluid/operators/file_label_reader_op.h +++ b/paddle/fluid/operators/file_label_reader_op.h @@ -106,7 +106,8 @@ class FileDataReader { batch_size_ = ctx.Attr("batch_size"); current_epoch_ = 0; current_iter_ = 0; - iters_per_epoch_ = labels.size() / (batch_size_ * world_size_); + auto total_batch_size = batch_size_ * world_size_; + iters_per_epoch_ = (labels.size() + total_batch_size) / total_batch_size; is_closed_ = false; for (int i = 0, n = files.size(); i < n; i++) image_label_pairs_.emplace_back(std::move(files[i]), labels[i]); @@ -150,8 +151,8 @@ class FileDataReader { } void ShutDown() { - if (queue_) queue_->Close(); - if (label_queue_) label_queue_->Close(); + if (queue_ && !queue_->IsClosed()) queue_->Close(); + if (label_queue_ && !label_queue_->IsClosed()) label_queue_->Close(); is_closed_.store(true); if (load_thrd_.joinable()) { @@ -177,7 +178,16 @@ class FileDataReader { ret.reserve(batch_size_); int start_index = GetStartIndex(); for (int32_t i = start_index; i < start_index + batch_size_; ++i) { - // FIXME + if (static_cast(i) >= image_label_pairs_.size()) { + // FIXME(dkp): refine close pipeline + while (queue_->Size()) sleep(0.5); + queue_->Close(); + while (label_queue_->Size()) sleep(0.5); + label_queue_->Close(); + + is_closed_.store(true); + break; + } i %= image_label_pairs_.size(); framework::LoDTensor tmp = ReadSample(image_label_pairs_[i].first); ret.push_back(std::move(tmp)); diff --git a/python/paddle/fluid/dataloader/pipeline.py b/python/paddle/fluid/dataloader/pipeline.py index dc8c12de661b41..89eb8a773e813d 100755 --- a/python/paddle/fluid/dataloader/pipeline.py +++ b/python/paddle/fluid/dataloader/pipeline.py @@ -111,18 +111,16 @@ def __next__(self): "Pipeline not built, please call build() firstly" self._output_vars = self._prepare_output_vars() - # try: - import sys - import time - tic = time.time() - _C_ops.dataloader(self._output_vars, *self._attrs) - toc = time.time() - 
print("_C_ops calling cost {}ms".format((toc - tic) * 1000.)) - sys.stdout.flush() - # except: - # print("_C_ops dataloader except enter") - # sys.stdout.flush() - # core._shutdown_dataloader() + try: + import sys + import time + tic = time.time() + _C_ops.dataloader(self._output_vars, *self._attrs) + toc = time.time() + print("_C_ops calling cost {}ms".format((toc - tic) * 1000.)) + sys.stdout.flush() + except: + raise StopIteration return {k: v for k, v in zip(self._out_names, self._output_vars)} @@ -130,5 +128,5 @@ def __next__(self): def next(self): return self.__next__() - def __del__(self): - core._shutdown_dataloader() + # def __del__(self): + # core._shutdown_dataloader() diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py index bdbf75acfff625..7fff15cc9807c6 100644 --- a/python/paddle/fluid/reader.py +++ b/python/paddle/fluid/reader.py @@ -437,9 +437,10 @@ def __call__(self): return self.__iter__() @staticmethod - def from_pipeline(pipeline): - assert isinstance(pipeline, Pipeline), \ - "pipeline should be an instance of paddle.io.Pipeline" + def build_pipeline(func, *args, **kwargs): + with Pipeline() as pipeline: + outputs = func(*args, **kwargs) + pipeline.set_outputs(outputs) pipeline.build() return pipeline From bd93dabf9af91e76858a7dc7ad9377a24e3629fc Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Wed, 12 Jan 2022 06:14:40 +0000 Subject: [PATCH 43/95] fix nvjpeg hw bug --- paddle/fluid/operators/data/nvjpeg_decoder.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/paddle/fluid/operators/data/nvjpeg_decoder.cc b/paddle/fluid/operators/data/nvjpeg_decoder.cc index f9428ee5b8c7c9..17d4c504ab48d3 100644 --- a/paddle/fluid/operators/data/nvjpeg_decoder.cc +++ b/paddle/fluid/operators/data/nvjpeg_decoder.cc @@ -122,7 +122,10 @@ void NvjpegDecoder::ParseDecodeParams( roi_generator->GenerateRandomROI(width, height, &roi); PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecodeParamsSetROI(decode_params_, roi.x, roi.y, 
roi.w, roi.h)); + height = roi.h; + width = roi.w; } + std::vector out_shape = {output_components, height, width}; out->Resize(framework::make_ddim(out_shape)); From 29f670bad44cd6e86d76c1c960d9bfc7a1d623c2 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Wed, 12 Jan 2022 12:22:33 +0000 Subject: [PATCH 44/95] use NVJPEG_OUTPUT_RGBI --- paddle/fluid/operators/data/nvjpeg_decoder.cc | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/data/nvjpeg_decoder.cc b/paddle/fluid/operators/data/nvjpeg_decoder.cc index 17d4c504ab48d3..931fb1411085cf 100644 --- a/paddle/fluid/operators/data/nvjpeg_decoder.cc +++ b/paddle/fluid/operators/data/nvjpeg_decoder.cc @@ -98,7 +98,7 @@ void NvjpegDecoder::ParseDecodeParams( output_format = NVJPEG_OUTPUT_Y; output_components = 1; } else if (components == 3) { - output_format = NVJPEG_OUTPUT_RGB; + output_format = NVJPEG_OUTPUT_RGBI; output_components = 3; } else { PADDLE_THROW(platform::errors::Fatal( @@ -108,7 +108,7 @@ void NvjpegDecoder::ParseDecodeParams( output_format = NVJPEG_OUTPUT_Y; output_components = 1; } else if (mode_ == "rgb") { - output_format = NVJPEG_OUTPUT_RGB; + output_format = NVJPEG_OUTPUT_RGBI; output_components = 3; } else { PADDLE_THROW(platform::errors::Fatal( @@ -131,10 +131,8 @@ void NvjpegDecoder::ParseDecodeParams( // allocate memory and assign to out_image auto* data = out->mutable_data(place); - for (int c = 0; c < output_components; c++) { - out_image->channel[c] = data + c * width * height; - out_image->pitch[c] = width; - } + out_image->channel[0] = data; + out_image->pitch[0] = output_components * width; } void NvjpegDecoder::Decode(const uint8_t* bit_stream, size_t bit_len, nvjpegImage_t* out_image) { From 84684b5d98f9fa4c2c51d62bfe52be83baf88da9 Mon Sep 17 00:00:00 2001 From: LielinJiang Date: Thu, 13 Jan 2022 05:42:41 +0000 Subject: [PATCH 45/95] add reader manager --- paddle/fluid/operators/data/CMakeLists.txt | 1 + 
paddle/fluid/operators/data/nvjpeg_decoder.cc | 2 + paddle/fluid/operators/data/shutdown.h | 9 +- .../fluid/operators/file_label_reader_op.cc | 151 ----------- paddle/fluid/operators/file_label_reader_op.h | 248 ------------------ python/paddle/vision/ops.py | 9 +- 6 files changed, 15 insertions(+), 405 deletions(-) delete mode 100644 paddle/fluid/operators/file_label_reader_op.cc delete mode 100644 paddle/fluid/operators/file_label_reader_op.h diff --git a/paddle/fluid/operators/data/CMakeLists.txt b/paddle/fluid/operators/data/CMakeLists.txt index 5f24e27d2ef866..0af5a7d2334acf 100644 --- a/paddle/fluid/operators/data/CMakeLists.txt +++ b/paddle/fluid/operators/data/CMakeLists.txt @@ -17,6 +17,7 @@ op_library(batch_decode_op SRCS batch_decode_op.cc batch_decode_op.cu DEPS nvjpe op_library(random_crop_and_resize_op SRCS random_crop_and_resize_op.cc random_crop_and_resize_op.cu DEPS ${OP_HEADER_DEPS}) op_library(batch_resize_op SRCS batch_resize_op.cc batch_resize_op.cu DEPS ${OP_HEADER_DEPS}) +op_library(file_label_reader_op SRCS file_label_reader_op.cc DEPS ${OP_HEADER_DEPS}) # register_operators() diff --git a/paddle/fluid/operators/data/nvjpeg_decoder.cc b/paddle/fluid/operators/data/nvjpeg_decoder.cc index f9428ee5b8c7c9..c143b590db02c8 100644 --- a/paddle/fluid/operators/data/nvjpeg_decoder.cc +++ b/paddle/fluid/operators/data/nvjpeg_decoder.cc @@ -122,6 +122,8 @@ void NvjpegDecoder::ParseDecodeParams( roi_generator->GenerateRandomROI(width, height, &roi); PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecodeParamsSetROI(decode_params_, roi.x, roi.y, roi.w, roi.h)); + height = roi.h; + width = roi.w; } std::vector out_shape = {output_components, height, width}; out->Resize(framework::make_ddim(out_shape)); diff --git a/paddle/fluid/operators/data/shutdown.h b/paddle/fluid/operators/data/shutdown.h index 5728867135b113..8b21e7cb19d3b2 100644 --- a/paddle/fluid/operators/data/shutdown.h +++ b/paddle/fluid/operators/data/shutdown.h @@ -13,7 +13,7 @@ // 
limitations under the License. #pragma once -#include "paddle/fluid/operators/file_label_reader_op.h" +#include "paddle/fluid/operators/data/file_label_reader_op.h" #include "paddle/fluid/operators/data/nvjpeg_decoder.h" #include "paddle/fluid/operators/data/map_runner.h" #include "paddle/fluid/operators/data/pipeline.h" @@ -22,7 +22,7 @@ namespace paddle { namespace operators { -extern FileDataReaderWrapper reader_wrapper; +// extern FileDataReaderWrapper reader_wrapper; namespace data { @@ -31,9 +31,10 @@ extern NvjpegDecoderThreadPool* decode_pool; void ShutDownDataLoader() { LOG(ERROR) << "ShutDownDataLoader enter"; // step 1: shutdown reader - reader_wrapper.ShutDown(); + // reader_wrapper.ShutDown(); + ReaderManager::Instance()->ShutDown(); LOG(ERROR) << "ShutDownDataLoader reader_wrapper shutdown finish"; - + // step 2: shutdown decoder if (decode_pool) decode_pool->ShutDown(); LOG(ERROR) << "ShutDownDataLoader decode_pool shutdown finish"; diff --git a/paddle/fluid/operators/file_label_reader_op.cc b/paddle/fluid/operators/file_label_reader_op.cc deleted file mode 100644 index a539d015390e08..00000000000000 --- a/paddle/fluid/operators/file_label_reader_op.cc +++ /dev/null @@ -1,151 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/operators/file_label_reader_op.h" - -namespace paddle { -namespace operators { - -FileDataReaderWrapper reader_wrapper; - -template -class CPUFileLabelKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override {} -}; - -class FileLabelReaderOp : public framework::OperatorBase { - public: - // using framework::OperatorWithKernel::OperatorWithKernel; - FileLabelReaderOp(const std::string& type, - const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - void InferShape(framework::InferShapeContext* ctx) const { - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of ReadFileOp is null.")); - } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const { - return framework::OpKernelType(framework::proto::VarType::UINT8, - platform::CPUPlace()); - } - - private: - void RunImpl(const framework::Scope& scope, - const platform::Place& dev_place) const override { - LOG(ERROR) << "FileLabelReaderOp RunImpl start"; - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& dev_ctx = *pool.Get(dev_place); - framework::RuntimeContext run_ctx(Inputs(), Outputs(), scope); - framework::ExecutionContext ctx(*this, scope, dev_ctx, run_ctx); - - auto* out = scope.FindVar(Output("Out")); - auto out_queue = out->Get().GetQueue(); - if (out_queue == nullptr) { - LOG(ERROR) << "FileLabelReaderOp init output queue"; - auto* holder = out->template GetMutable(); - holder->InitOnce(2); - out_queue = holder->GetQueue(); - } - - auto* out_label = scope.FindVar(Output("Label")); - auto out_label_queue = - out_label->Get().GetQueue(); - if (out_label_queue == nullptr) { - LOG(ERROR) << "FileLabelReaderOp init output label queue"; - auto* label_holder = - 
out_label->template GetMutable(); - label_holder->InitOnce(2); - out_label_queue = label_holder->GetQueue(); - } - - if (reader_wrapper.reader == nullptr) { - // create reader - reader_wrapper.SetUp(ctx, out_queue.get(), out_label_queue.get()); - } - // LoDTensorArray samples = reader_wrapper.reader->Next(); - // framework::LoDTensorArray out_array; - // out_array.resize(samples.size()); - // for (size_t i = 0; i < samples.size(); ++i) { - // copy_tensor(samples[i], &out_array[i]); - // } - // out_queue->Push(out_array); - LOG(ERROR) << "FileLabelReaderOp RunImpl finish"; - } - - void copy_tensor(const framework::LoDTensor& lod_tensor, - framework::LoDTensor* out) const { - if (lod_tensor.numel() == 0) return; - auto& out_tensor = *out; - TensorCopy(lod_tensor, lod_tensor.place(), &out_tensor); - out_tensor.set_lod(lod_tensor.lod()); - } - - // std::shared_ptr reader=nullptr; -}; - -class FileLabelReaderOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddOutput("Out", "The output tensor of ReadFile op"); - AddOutput("Label", "The output tensor of ReadFile op"); - AddComment(R"DOC( -This operator read a file. 
-)DOC"); - AddAttr("root_dir", "Path of the file to be readed.") - .SetDefault(""); - AddAttr("batch_size", "Path of the file to be readed.").SetDefault(1); - AddAttr("rank", "Path of the file to be readed.").SetDefault(0); - AddAttr("world_size", "Path of the file to be readed.").SetDefault(1); - AddAttr>("files", "Path of the file to be readed.") - .SetDefault({}); - AddAttr>("labels", "Path of the file to be readed.") - .SetDefault({}); - } -}; - -class FileLabelReaderInferShape : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext* context) const override { - OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", - "FileLabelReader"); - } -}; - -class FileLabelReaderInferVarType : public framework::VarTypeInference { - public: - void operator()(framework::InferVarTypeContext* ctx) const override { - ctx->SetOutputType("Out", framework::proto::VarType::LOD_TENSOR_ARRAY, - framework::ALL_ELEMENTS); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OPERATOR( - file_label_reader, ops::FileLabelReaderOp, ops::FileLabelReaderOpMaker, - ops::FileLabelReaderInferShape, ops::FileLabelReaderInferVarType, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker) - -REGISTER_OP_CPU_KERNEL(file_label_reader, ops::CPUFileLabelKernel) diff --git a/paddle/fluid/operators/file_label_reader_op.h b/paddle/fluid/operators/file_label_reader_op.h deleted file mode 100644 index 6d828688e8ec06..00000000000000 --- a/paddle/fluid/operators/file_label_reader_op.h +++ /dev/null @@ -1,248 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include - -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/framework/lod_tensor_array.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" - -namespace paddle { -namespace operators { - -using LoDTensorArray = framework::LoDTensorArray; -using LoDTensorBlockingQueue = operators::reader::LoDTensorBlockingQueue; -using LoDTensorBlockingQueueHolder = operators::reader::LoDTensorBlockingQueueHolder; - -enum BufferStatus { - kBufferStatusSuccess = 0, - kBufferStatusErrorClosed, - kBufferStatusEmpty -}; - -template -class Buffer final { - public: - explicit Buffer(size_t max_len = 2) : max_len_(max_len), is_closed_(false) {} - ~Buffer() = default; - - BufferStatus Push(const T& item); - BufferStatus Pull(T* item); - BufferStatus TryReceive(T* item); - void Close(); - - private: - std::queue queue_; - mutable std::mutex mutex_; - size_t max_len_; - bool is_closed_; - std::condition_variable cond_; -}; - -template -BufferStatus Buffer::Push(const T& item) { - std::unique_lock lock(mutex_); - cond_.wait(lock, [this]() { return queue_.size() < max_len_ || is_closed_; }); - if (is_closed_) { - return kBufferStatusErrorClosed; - } - - queue_.push(item); - cond_.notify_one(); - return kBufferStatusSuccess; -} - -template -BufferStatus Buffer::Pull(T* item) { - std::unique_lock lock(mutex_); - cond_.wait(lock, [this]() { return (!queue_.empty()) 
|| is_closed_; }); - if (queue_.empty()) { - return kBufferStatusErrorClosed; - } - *item = queue_.front(); - queue_.pop(); - if (queue_.size() < max_len_) { - cond_.notify_all(); - } - return kBufferStatusSuccess; -} - -template -void Buffer::Close() { - std::unique_lock lock(mutex_); - is_closed_ = true; - cond_.notify_all(); -} - -class FileDataReader { - public: - explicit FileDataReader(const framework::ExecutionContext& ctx, - LoDTensorBlockingQueue* queue, LoDTensorBlockingQueue* label_queue) - : queue_(queue), label_queue_(label_queue){ - std::vector files = - ctx.Attr>("files"); - std::vector labels = ctx.Attr>("labels"); - rank_ = ctx.Attr("rank"); - world_size_ = ctx.Attr("world_size"); - // std::cout << "files and labels size: " << files.size() << " " - // << labels.size() << std::endl; - batch_size_ = ctx.Attr("batch_size"); - current_epoch_ = 0; - current_iter_ = 0; - iters_per_epoch_ = labels.size() / (batch_size_ * world_size_); - is_closed_ = false; - for (int i = 0, n = files.size(); i < n; i++) - image_label_pairs_.emplace_back(std::move(files[i]), labels[i]); - StartLoadThread(); - } - - int GetStartIndex() { - int start_idx = - batch_size_ * world_size_ * (current_iter_ % iters_per_epoch_) + - rank_ * batch_size_; - current_iter_++; - return start_idx; - } - - framework::LoDTensor ReadSample(const std::string filename) { - std::ifstream input(filename.c_str(), - std::ios::in | std::ios::binary | std::ios::ate); - std::streamsize file_size = input.tellg(); - - input.seekg(0, std::ios::beg); - - // auto* out = ctx.Output("Out"); - framework::LoDTensor out; - std::vector out_shape = {file_size}; - out.Resize(framework::make_ddim(out_shape)); - - uint8_t* data = out.mutable_data(platform::CPUPlace()); - - input.read(reinterpret_cast(data), file_size); - return out; - } - - void StartLoadThread() { - if (load_thrd_.joinable()) { - return; - } - - load_thrd_ = std::thread([this] { - while (!is_closed_.load()) LoadBatch(); - }); - } - - void 
ShutDown() { - if (queue_) queue_->Close(); - if (label_queue_) label_queue_->Close(); - - is_closed_.store(true); - if (load_thrd_.joinable()) { - load_thrd_.join(); - } - } - - // LoDTensorArray Read() { - // LoDTensorArray ret; - // ret.reserve(batch_size_); - // int start_index = GetStartIndex(); - // for (int32_t i = start_index; i < start_index + batch_size_; ++i) { - // // FIXME - // i %= image_label_pairs_.size(); - // framework::LoDTensor tmp = ReadSample(image_label_pairs_[i].first); - // ret.push_back(std::move(tmp)); - // } - // return ret; - // } - std::pair> Read() { - LoDTensorArray ret; - std::vector label; - ret.reserve(batch_size_); - int start_index = GetStartIndex(); - for (int32_t i = start_index; i < start_index + batch_size_; ++i) { - // FIXME - i %= image_label_pairs_.size(); - framework::LoDTensor tmp = ReadSample(image_label_pairs_[i].first); - ret.push_back(std::move(tmp)); - label.push_back(image_label_pairs_[i].second); - } - return std::make_pair(ret, label); - } - - // LoDTensorArray Next() { - // LoDTensorArray batch_data; - // batch_buffer_.Pull(&batch_data); - // return batch_data; - // } - // - void LoadBatch() { - // std::cout << "start LoadBatch 0.01" << std::endl; - // LoDTensorArray batch_data = std::move(Read()); - // queue_->Push(batch_data); - - auto batch_data = std::move(Read()); - queue_->Push(batch_data.first); - framework::LoDTensor label_tensor; - LoDTensorArray label_array; - // auto& label_tensor = label.GetMutable(); - label_tensor.Resize( - framework::make_ddim({static_cast(batch_data.first.size())})); - platform::CPUPlace cpu; - auto* label_data = label_tensor.mutable_data(cpu); - for (size_t i = 0; i < batch_data.first.size(); ++i) { - label_data[i] = batch_data.second[i]; - } - label_array.push_back(label_tensor); - label_queue_->Push(label_array); - } - - private: - int batch_size_; - std::string file_root_, file_list_; - std::vector> image_label_pairs_; - int current_epoch_; - int current_iter_; - int rank_; 
- int world_size_; - int iters_per_epoch_; - std::atomic is_closed_; - Buffer batch_buffer_; - std::thread load_thrd_; - LoDTensorBlockingQueue* queue_; - LoDTensorBlockingQueue* label_queue_; -}; - -class FileDataReaderWrapper { - public: - void SetUp(const framework::ExecutionContext& ctx, - LoDTensorBlockingQueue* queue, LoDTensorBlockingQueue* label_queue) { - reader.reset(new FileDataReader(ctx, queue, label_queue)); - } - - std::shared_ptr reader = nullptr; - - void ShutDown() { - if (reader.get()) reader->ShutDown(); - } -}; - - -} // namespace operators -} // namespace paddle diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index 9c3a8ac45d671e..bb58eb43f99588 100644 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -895,17 +895,22 @@ def file_label_reader(file_root, batch_size, name=None): samples = [s[0] for s in data_folder.samples] targets = [s[1] for s in data_folder.samples] + import time + unq_reader_id = lambda : int(round(time.time()* 1000*1000)) + + if in_dygraph_mode(): return _C_ops.file_label_reader('root_dir', file_root, 'batch_size', batch_size, 'files', samples, 'labels', - targets) + targets, 'reader_id', unq_reader_id) inputs = dict() attrs = { 'root_dir': file_root, 'batch_size': batch_size, 'files': samples, - 'labels': targets + 'labels': targets, + 'reader_id': unq_reader_id, } helper = LayerHelper("file_label_reader", **locals()) From 47aa4fcbba70adc5f75f19a9e69e920f816f1857 Mon Sep 17 00:00:00 2001 From: LielinJiang Date: Thu, 13 Jan 2022 07:19:20 +0000 Subject: [PATCH 46/95] mv file label reader to data/ , add reader manager --- .../operators/data/file_label_reader_op.cc | 162 ++++++++++++++++++ .../{ => data}/file_label_reader_op.h | 84 +++++---- 2 files changed, 215 insertions(+), 31 deletions(-) create mode 100644 paddle/fluid/operators/data/file_label_reader_op.cc rename paddle/fluid/operators/{ => data}/file_label_reader_op.h (82%) diff --git 
a/paddle/fluid/operators/data/file_label_reader_op.cc b/paddle/fluid/operators/data/file_label_reader_op.cc new file mode 100644 index 00000000000000..d6648ed86e4099 --- /dev/null +++ b/paddle/fluid/operators/data/file_label_reader_op.cc @@ -0,0 +1,162 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/data/file_label_reader_op.h" + +namespace paddle { +namespace operators { +namespace data { +// FileDataReaderWrapper reader_wrapper; + +// initialization static variables out of ReaderManager +ReaderManager *ReaderManager::rm_instance_ptr_ = nullptr; +std::mutex ReaderManager::m_; + +template +class CPUFileLabelKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override {} +}; + +class FileLabelReaderOp : public framework::OperatorBase { + public: + // using framework::OperatorWithKernel::OperatorWithKernel; + FileLabelReaderOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext* ctx) const { + PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, + platform::errors::InvalidArgument( + "Output(Out) of ReadFileOp is null.")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const 
framework::ExecutionContext& ctx) const { + return framework::OpKernelType(framework::proto::VarType::UINT8, + platform::CPUPlace()); + } + + private: + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override { + LOG(ERROR) << "FileLabelReaderOp RunImpl start"; + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& dev_ctx = *pool.Get(dev_place); + framework::RuntimeContext run_ctx(Inputs(), Outputs(), scope); + framework::ExecutionContext ctx(*this, scope, dev_ctx, run_ctx); + + auto* out = scope.FindVar(Output("Out")); + auto out_queue = out->Get().GetQueue(); + if (out_queue == nullptr) { + LOG(ERROR) << "FileLabelReaderOp init output queue"; + auto* holder = out->template GetMutable(); + holder->InitOnce(2); + out_queue = holder->GetQueue(); + } + + auto* out_label = scope.FindVar(Output("Label")); + auto out_label_queue = + out_label->Get().GetQueue(); + if (out_label_queue == nullptr) { + LOG(ERROR) << "FileLabelReaderOp init output label queue"; + auto* label_holder = + out_label->template GetMutable(); + label_holder->InitOnce(2); + out_label_queue = label_holder->GetQueue(); + } + + // if (reader_wrapper.reader == nullptr) { + // // create reader + // reader_wrapper.SetUp(ctx, out_queue.get(), out_label_queue.get()); + // } + ReaderManager::Instance()->GetReader( + 0, ctx, out_queue.get(), out_label_queue.get()); + // LoDTensorArray samples = reader_wrapper.reader->Next(); + // framework::LoDTensorArray out_array; + // out_array.resize(samples.size()); + // for (size_t i = 0; i < samples.size(); ++i) { + // copy_tensor(samples[i], &out_array[i]); + // } + // out_queue->Push(out_array); + LOG(ERROR) << "FileLabelReaderOp RunImpl finish"; + } + + void copy_tensor(const framework::LoDTensor& lod_tensor, + framework::LoDTensor* out) const { + if (lod_tensor.numel() == 0) return; + auto& out_tensor = *out; + TensorCopy(lod_tensor, lod_tensor.place(), &out_tensor); + 
out_tensor.set_lod(lod_tensor.lod()); + } + + // std::shared_ptr reader=nullptr; +}; + +class FileLabelReaderOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddOutput("Out", "The output tensor of ReadFile op"); + AddOutput("Label", "The output tensor of ReadFile op"); + AddComment(R"DOC( +This operator read a file. +)DOC"); + AddAttr("root_dir", "Path of the file to be readed.") + .SetDefault(""); + AddAttr("batch_size", "Path of the file to be readed.").SetDefault(1); + AddAttr("rank", "Path of the file to be readed.").SetDefault(0); + AddAttr("world_size", "Path of the file to be readed.").SetDefault(1); + AddAttr("reader_id", + "(int64_t)" + "The unique hash id used as cache key for " + "ExecutorInfoCache").SetDefault(0);; + AddAttr>("files", "Path of the file to be readed.") + .SetDefault({}); + AddAttr>("labels", "Path of the file to be readed.") + .SetDefault({}); + } +}; + +class FileLabelReaderInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* context) const override { + OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", + "FileLabelReader"); + } +}; + +class FileLabelReaderInferVarType : public framework::VarTypeInference { + public: + void operator()(framework::InferVarTypeContext* ctx) const override { + ctx->SetOutputType("Out", framework::proto::VarType::LOD_TENSOR_ARRAY, + framework::ALL_ELEMENTS); + } +}; + +} // namespace data +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators::data; + +REGISTER_OPERATOR( + file_label_reader, ops::FileLabelReaderOp, ops::FileLabelReaderOpMaker, + ops::FileLabelReaderInferShape, ops::FileLabelReaderInferVarType, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker) + +REGISTER_OP_CPU_KERNEL(file_label_reader, ops::CPUFileLabelKernel) diff --git a/paddle/fluid/operators/file_label_reader_op.h b/paddle/fluid/operators/data/file_label_reader_op.h similarity 
index 82% rename from paddle/fluid/operators/file_label_reader_op.h rename to paddle/fluid/operators/data/file_label_reader_op.h index 12cdcc2d1b5b54..a0e70455ce8663 100644 --- a/paddle/fluid/operators/file_label_reader_op.h +++ b/paddle/fluid/operators/data/file_label_reader_op.h @@ -26,7 +26,7 @@ namespace paddle { namespace operators { - +namespace data { using LoDTensorArray = framework::LoDTensorArray; using LoDTensorBlockingQueue = operators::reader::LoDTensorBlockingQueue; using LoDTensorBlockingQueueHolder = operators::reader::LoDTensorBlockingQueueHolder; @@ -101,11 +101,11 @@ class FileDataReader { std::vector labels = ctx.Attr>("labels"); rank_ = ctx.Attr("rank"); world_size_ = ctx.Attr("world_size"); - // std::cout << "files and labels size: " << files.size() << " " - // << labels.size() << std::endl; + batch_size_ = ctx.Attr("batch_size"); current_epoch_ = 0; current_iter_ = 0; + // iters_per_epoch_ = labels.size() / (batch_size_ * world_size_); auto total_batch_size = batch_size_ * world_size_; iters_per_epoch_ = (labels.size() + total_batch_size) / total_batch_size; is_closed_ = false; @@ -160,18 +160,7 @@ class FileDataReader { } } - // LoDTensorArray Read() { - // LoDTensorArray ret; - // ret.reserve(batch_size_); - // int start_index = GetStartIndex(); - // for (int32_t i = start_index; i < start_index + batch_size_; ++i) { - // // FIXME - // i %= image_label_pairs_.size(); - // framework::LoDTensor tmp = ReadSample(image_label_pairs_[i].first); - // ret.push_back(std::move(tmp)); - // } - // return ret; - // } + std::pair> Read() { LoDTensorArray ret; std::vector label; @@ -196,16 +185,8 @@ class FileDataReader { return std::make_pair(ret, label); } - // LoDTensorArray Next() { - // LoDTensorArray batch_data; - // batch_buffer_.Pull(&batch_data); - // return batch_data; - // } - // + void LoadBatch() { - // std::cout << "start LoadBatch 0.01" << std::endl; - // LoDTensorArray batch_data = std::move(Read()); - // queue_->Push(batch_data); auto 
batch_data = std::move(Read()); queue_->Push(batch_data.first); @@ -239,20 +220,61 @@ class FileDataReader { LoDTensorBlockingQueue* label_queue_; }; -class FileDataReaderWrapper { + +class ReaderManager { + // PipelineManager is a signleton manager for Pipeline, we + // create single Pipeline for a program id + private: + DISABLE_COPY_AND_ASSIGN(ReaderManager); + + static ReaderManager *rm_instance_ptr_; + static std::mutex m_; + + std::map> prog_id_to_reader_; + public: - void SetUp(const framework::ExecutionContext& ctx, - LoDTensorBlockingQueue* queue, LoDTensorBlockingQueue* label_queue) { - reader.reset(new FileDataReader(ctx, queue, label_queue)); + static ReaderManager *Instance() { + if (rm_instance_ptr_ == nullptr) { + std::lock_guard lk(m_); + if (rm_instance_ptr_ == nullptr) { + rm_instance_ptr_ = new ReaderManager; + } + } + return rm_instance_ptr_; } - std::shared_ptr reader = nullptr; + // FileDataReader* GetReader( + void GetReader( + int64_t program_id, const framework::ExecutionContext& ctx, + LoDTensorBlockingQueue* queue, LoDTensorBlockingQueue* label_queue) { + auto iter = prog_id_to_reader_.find(program_id); + if (iter == prog_id_to_reader_.end()) { + prog_id_to_reader_[program_id] = std::unique_ptr(new FileDataReader(ctx, queue, label_queue)); + // return prog_id_to_reader_[program_id].get(); + } else { + // return iter->second.get(); + } + } void ShutDown() { - if (reader.get()) reader->ShutDown(); + auto iter = prog_id_to_reader_.begin(); + while (iter != prog_id_to_reader_.end()){ + if(iter->second.get()){ + iter->second->ShutDown(); + } + iter++; + } + prog_id_to_reader_.clear(); } -}; + ReaderManager() { VLOG(1) << "ReaderManager init"; } + + ~ReaderManager() { + VLOG(1) << "~ReaderManager"; + ShutDown(); + } +}; +} // namespace data } // namespace operators } // namespace paddle From 2e1a4dbe0b6b8abd26e9016db023aafd15313276 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Wed, 19 Jan 2022 14:25:23 +0000 Subject: [PATCH 47/95] add loader 
& data_reader, compile success --- .../fluid/framework/ir/data_io_queue_pass.cc | 1 + paddle/fluid/operators/data/CMakeLists.txt | 4 +- .../fluid/operators/data/batch_decode_op.cc | 16 - paddle/fluid/operators/data/data_reader_op.cc | 119 +++++++ paddle/fluid/operators/data/data_reader_op.h | 308 ++++++++++++++++++ .../operators/data/file_label_loader_op.cc | 162 +++++++++ .../operators/data/file_label_loader_op.h | 308 ++++++++++++++++++ .../operators/data/file_label_reader_op.cc | 162 --------- .../operators/data/file_label_reader_op.h | 280 ---------------- paddle/fluid/operators/data/map_op.cc | 1 - paddle/fluid/operators/data/map_runner.h | 1 - paddle/fluid/operators/data/nvjpeg_decoder.cc | 2 +- paddle/fluid/operators/data/shutdown.h | 10 +- python/paddle/fluid/dataloader/ops.py | 58 +++- python/paddle/vision/ops.py | 47 +++ 15 files changed, 1010 insertions(+), 469 deletions(-) create mode 100644 paddle/fluid/operators/data/data_reader_op.cc create mode 100644 paddle/fluid/operators/data/data_reader_op.h create mode 100644 paddle/fluid/operators/data/file_label_loader_op.cc create mode 100644 paddle/fluid/operators/data/file_label_loader_op.h delete mode 100644 paddle/fluid/operators/data/file_label_reader_op.cc delete mode 100644 paddle/fluid/operators/data/file_label_reader_op.h diff --git a/paddle/fluid/framework/ir/data_io_queue_pass.cc b/paddle/fluid/framework/ir/data_io_queue_pass.cc index 490020c25bdb2c..8d9769da9ec681 100644 --- a/paddle/fluid/framework/ir/data_io_queue_pass.cc +++ b/paddle/fluid/framework/ir/data_io_queue_pass.cc @@ -27,6 +27,7 @@ class Graph; std::set output_queue_holder_ops = { "file_label_reader", "map", + "data_reader", }; std::set input_array_ops = { diff --git a/paddle/fluid/operators/data/CMakeLists.txt b/paddle/fluid/operators/data/CMakeLists.txt index 0af5a7d2334acf..0db8f408a151de 100644 --- a/paddle/fluid/operators/data/CMakeLists.txt +++ b/paddle/fluid/operators/data/CMakeLists.txt @@ -7,6 +7,8 @@ endif() 
cc_library(pipeline SRCS pipeline.cc DEPS parallel_executor simple_threadpool scope) op_library(dataloader_op SRCS dataloader_op.cc dataloader_op.cu.cc DEPS pipeline ${OP_HEADER_DEPS}) +op_library(data_reader_op SRCS data_reader_op.cc DEPS ${OP_HEADER_DEPS}) + cc_library(map_runner SRCS map_runner.cc DEPS parallel_executor simple_threadpool scope) op_library(map_op SRCS map_op.cc map_op.cu.cc DEPS map_runner ${OP_HEADER_DEPS}) @@ -17,7 +19,7 @@ op_library(batch_decode_op SRCS batch_decode_op.cc batch_decode_op.cu DEPS nvjpe op_library(random_crop_and_resize_op SRCS random_crop_and_resize_op.cc random_crop_and_resize_op.cu DEPS ${OP_HEADER_DEPS}) op_library(batch_resize_op SRCS batch_resize_op.cc batch_resize_op.cu DEPS ${OP_HEADER_DEPS}) -op_library(file_label_reader_op SRCS file_label_reader_op.cc DEPS ${OP_HEADER_DEPS}) +op_library(file_label_loader_op SRCS file_label_loader_op.cc DEPS ${OP_HEADER_DEPS}) # register_operators() diff --git a/paddle/fluid/operators/data/batch_decode_op.cc b/paddle/fluid/operators/data/batch_decode_op.cc index ea11c1da36d5e9..d7c39f0aaf1c4d 100644 --- a/paddle/fluid/operators/data/batch_decode_op.cc +++ b/paddle/fluid/operators/data/batch_decode_op.cc @@ -25,22 +25,6 @@ class BatchDecodeOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "DecodeJpeg"); OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "DecodeJpeg"); - - // auto mode = ctx->Attrs().Get("mode"); - // std::vector out_dims; - // - // if (mode == "unchanged") { - // out_dims = {-1, -1, -1}; - // } else if (mode == "gray") { - // out_dims = {1, -1, -1}; - // } else if (mode == "rgb") { - // out_dims = {3, -1, -1}; - // } else { - // PADDLE_THROW(platform::errors::Fatal( - // "The provided mode is not supported for JPEG files on GPU: ", mode)); - // } - // - // ctx->SetOutputDim("Out", framework::make_ddim(out_dims)); } protected: diff --git 
a/paddle/fluid/operators/data/data_reader_op.cc b/paddle/fluid/operators/data/data_reader_op.cc new file mode 100644 index 00000000000000..955e12e28be522 --- /dev/null +++ b/paddle/fluid/operators/data/data_reader_op.cc @@ -0,0 +1,119 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/data/data_reader_op.h" + +namespace paddle { +namespace operators { +namespace data { + +// initialization static variables out of ReaderManager +ReaderManager *ReaderManager::rm_instance_ptr_ = nullptr; +std::mutex ReaderManager::m_; + +class DataReaderOp : public framework::OperatorBase { + public: + DataReaderOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext* ctx) const { + OP_INOUT_CHECK(ctx->HasOutputs("Out"), "Output", "Out", "DataReaderOp"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + return framework::OpKernelType(framework::proto::VarType::FP32, + ctx.GetPlace()); + } + + private: + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override { + auto outputs = Outputs("Out"); + std::vector output_vars; + output_vars.reserve(outputs.size()); + for (auto& 
output : outputs) { + output_vars.emplace_back(scope.FindVar(output)); + } + + CheckAndInitOutputQueue(output_vars, /*capacity=*/2); + + auto batch_size = Attr("batch_size"); + auto num_samples = Attr("num_samples"); + auto shuffle = Attr("shuffle"); + auto drop_last = Attr("drop_last"); + auto rank = Attr("rank"); + auto world_size = Attr("world_size"); + auto indices_var_name = Attr("indices_var_name"); + auto output_var_names = Attr>("output_var_names"); + auto* reader_block = Attr("reader_block"); + auto reader_id = Attr("reader_id"); + + auto output_queues = GetQueueVecFromVariableVec(output_vars); + ReaderManager::Instance()->StartDataReader( + reader_id, reader_block, &scope, dev_place, indices_var_name, + output_var_names, output_queues, batch_size, num_samples, + shuffle, drop_last, rank, world_size); + } +}; + +class DataReaderOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddOutput("Out", "The output queue variable of DataReader op") + .AsDuplicable(); + AddAttr("batch_size", "The batch size for reading samples") + .SetDefault(1); + AddAttr("num_samples", "The sample number in dataset"); + AddAttr("shuffle", "Whether shuffle the dataset") + .SetDefault(false); + AddAttr("drop_last", "Whether drop last incomplete batch") + .SetDefault(false); + AddAttr("rank", "The logical rank of current device.") + .SetDefault(0); + AddAttr("world_size", "The number of running devices.") + .SetDefault(1); + AddAttr("reader_id", "The unique id to generate and get reader"); + AddAttr("reader_block", + "(BlockDesc *)" + "The global block of executed reader program " + "desc."); + AddAttr("indices_var_name", + "(string)" + "input variable names for sample indices"); + AddAttr>("output_var_names", + "(list of string)" + "output variable names for reader program"); + AddComment(R"DOC( + This operator read a file. 
+)DOC"); + } +}; + +} // namespace data +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators::data; + +REGISTER_OPERATOR( + data_reader, ops::DataReaderOp, ops::DataReaderOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker) + +REGISTER_OP_CPU_KERNEL(data_reader, ops::DataReaderCPUKernel) diff --git a/paddle/fluid/operators/data/data_reader_op.h b/paddle/fluid/operators/data/data_reader_op.h new file mode 100644 index 00000000000000..2bdd2a9ff8085e --- /dev/null +++ b/paddle/fluid/operators/data/data_reader_op.h @@ -0,0 +1,308 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include +#include +#include + +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" + +namespace paddle { +namespace operators { +namespace data { + +using Scope = framework::Scope; +using Variable = framework::Variable; +using BlockDesc = framework::BlockDesc; +using LoDTensor = framework::LoDTensor; +using LoDTensorArray = framework::LoDTensorArray; +using LoDTensorBlockingQueue = operators::reader::LoDTensorBlockingQueue; +using LoDTensorBlockingQueueHolder = operators::reader::LoDTensorBlockingQueueHolder; + +class Sampler { + public: + explicit Sampler(const int64_t batch_size, const int64_t num_samples, + const bool shuffle, const bool drop_last, + const int rank, const int world_size) + : current_iter_(0), + batch_size_(batch_size), + num_samples_(num_samples), + drop_last_(drop_last), + rank_(rank), + world_size_(world_size) { + sample_ids_.reserve(num_samples); + for (int64_t i = 0; i < num_samples; i++) { + sample_ids_.emplace_back(i); + } + if (shuffle) { + rnd_.seed(time(0)); + std::shuffle(sample_ids_.begin(), sample_ids_.end(), rnd_); + } + } + + void GetNextIndices(std::vector* indices) { + int64_t start_idx = + batch_size_ * world_size_ * current_iter_ + rank_ * batch_size_; + current_iter_++; + + if (start_idx >= num_samples_) return; + if (drop_last_ && start_idx + batch_size_ >= num_samples_) return; + + int64_t batch_len = std::min(batch_size_, num_samples_ - start_idx); + indices->reserve(batch_len); + for (int64_t i = 0; i < batch_len; i++) { + indices->emplace_back(sample_ids_[start_idx + i]); + } + } + + private: + int64_t current_iter_; + const int64_t batch_size_; + const int64_t num_samples_; + const bool 
drop_last_; + const int rank_; + const int world_size_; + + std::mt19937 rnd_; + std::vector sample_ids_; +}; + +class DataReader { + public: + explicit DataReader(BlockDesc* reader_block, + const Scope* scope, + const platform::Place place, + const std::string &indices_var_name, + const std::vector &output_var_names, + const std::vector> output_queues, + const int batch_size, + const int num_samples, + const bool shuffle, + const bool drop_last, + const int rank, + const int world_size) + : running_(true), + reader_block_(reader_block), + place_(place), + indices_var_name_(indices_var_name), + output_var_names_(output_var_names), + output_queues_(output_queues), + batch_size_(batch_size), + sampler_(batch_size, num_samples, shuffle, + drop_last, rank, world_size) { + StartReaderThread(scope); + } + + void StartReaderThread(const Scope* scope) { + if (reader_thread_.joinable()) { + return; + } + + reader_thread_ = std::thread([this, scope] { + auto& scope_ = scope->NewScope(); + framework::Executor executor(place_); + while (running_.load()) { + std::vector indices; + sampler_.GetNextIndices(&indices); + // shutdown reader if indices drained + if (indices.size() == 0) ShutDown(); + + ShareIndicesIntoScope(&scope_, indices); + + try { + executor.Run(*reader_block_->Program(), &scope_, + static_cast(reader_block_->ID()), + false, true, std::vector(), + false, true); + } catch (...) 
{ + break; + } + + for (size_t i = 0; i < output_var_names_.size(); i++) { + auto *out_var = scope_.FindVar(output_var_names_[i]); + PADDLE_ENFORCE_NOT_NULL( + out_var, platform::errors::NotFound( + "The output variable %s is not found in DataReader " + "program's internal scope", output_var_names_[i])); + // CheckOutputVarStatus(*out_var, output_var_names_[i]); + + if (out_var->IsType()) { + framework::LoDTensorArray t_arr(1); + copy_tensor(out_var->Get(), &t_arr[0]); + output_queues_[i]->Push(t_arr); + } else { + auto out_arr = out_var->Get(); + framework::LoDTensorArray t_arr(out_arr.size()); + for (size_t i = 0; i < out_arr.size(); i++) { + copy_tensor(out_arr[i], &t_arr[i]); + } + output_queues_[i]->Push(t_arr); + } + } + } + scope->DeleteScope(&scope_); + }); + } + + void ShutDown() { + for(auto& queue: output_queues_) { + while (queue->Size()) sleep(0.5); + queue->Close(); + } + + running_.store(false); + if (reader_thread_.joinable()) reader_thread_.join(); + } + + void ShareIndicesIntoScope(Scope* scope, + std::vector indices) { + // get indices variable from scope + auto* var = scope->Var(indices_var_name_); + + auto* indices_tensor = var->GetMutable(); + indices_tensor->Resize(framework::make_ddim({batch_size_})); + auto* indices_data = indices_tensor->mutable_data(place_); + + for (size_t i = 0; i < indices.size(); i++) { + indices_data[i] = indices[i]; + } + } + + private: + std::atomic running_; + std::thread reader_thread_; + + BlockDesc* reader_block_; + platform::Place place_; + + std::string indices_var_name_; + std::vector output_var_names_; + std::vector> output_queues_; + + const int64_t batch_size_; + Sampler sampler_; + + void copy_tensor(const framework::LoDTensor& lod_tensor, + framework::LoDTensor* out) const { + if (lod_tensor.numel() == 0) return; + auto& out_tensor = *out; + TensorCopy(lod_tensor, lod_tensor.place(), &out_tensor); + out_tensor.set_lod(lod_tensor.lod()); + } +}; + + +class ReaderManager { + private: + 
DISABLE_COPY_AND_ASSIGN(ReaderManager); + + static ReaderManager *rm_instance_ptr_; + static std::mutex m_; + + std::map> id_to_reader_; + + public: + static ReaderManager *Instance() { + if (rm_instance_ptr_ == nullptr) { + std::lock_guard lk(m_); + if (rm_instance_ptr_ == nullptr) { + rm_instance_ptr_ = new ReaderManager; + } + } + return rm_instance_ptr_; + } + + void StartDataReader( + const int64_t reader_id, BlockDesc *reader_block, + const Scope* scope, const platform::Place place, + const std::string &indices_var_name, + const std::vector &output_var_names, + const std::vector> &output_queues, + const int batch_size, const int num_samples, const bool shuffle, + const bool drop_last, const int rank, const int world_size) { + auto iter = id_to_reader_.find(reader_id); + if (iter == id_to_reader_.end()) { + id_to_reader_[reader_id] = std::unique_ptr( + new DataReader(reader_block, scope, place, indices_var_name, + output_var_names, output_queues, batch_size, + num_samples, shuffle, drop_last, rank, world_size)); + } + } + + void ShutDown() { + auto iter = id_to_reader_.begin(); + while (iter != id_to_reader_.end()){ + if(iter->second.get()){ + iter->second->ShutDown(); + } + iter++; + } + id_to_reader_.clear(); + } + + ReaderManager() { VLOG(1) << "ReaderManager init"; } + + ~ReaderManager() { + VLOG(1) << "~ReaderManager"; + ShutDown(); + } +}; + +static void CheckAndInitOutputQueue(const std::vector& vars, int capacity) { + for (auto var : vars) { + if (var->IsInitialized()) { + PADDLE_ENFORCE_EQ(var->IsType(), true, + platform::errors::InvalidArgument( + "Output Variables of MapOp should hold " + "LoDTensorBlockingQueueHolder type")); + auto queue = var->Get().GetQueue(); + if (queue == nullptr) { + auto* holder = var->template GetMutable(); + holder->InitOnce(capacity); + LOG(ERROR) << "MapOpKernel init queue" << holder->GetQueue(); + } + } else { + VLOG(1) << "Initialize Output LoDTensorBlockingQueue capacity " << capacity; + auto* holder = 
var->GetMutable(); + holder->InitOnce(capacity); + } + } +} + +static std::vector> GetQueueVecFromVariableVec(const std::vector& vars) { + std::vector> queues; + queues.reserve(vars.size()); + for (size_t i = 0; i < vars.size(); i++) { + queues.push_back(vars[i]->Get().GetQueue()); + } + return queues; +} + +template +class DataReaderCPUKernel: public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override {} +}; + +} // namespace data +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/data/file_label_loader_op.cc b/paddle/fluid/operators/data/file_label_loader_op.cc new file mode 100644 index 00000000000000..d63de3e471f9ff --- /dev/null +++ b/paddle/fluid/operators/data/file_label_loader_op.cc @@ -0,0 +1,162 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/operators/data/file_label_loader_op.h" + +namespace paddle { +namespace operators { +namespace data { +// FileDataReaderWrapper reader_wrapper; + +// // initialization static variables out of ReaderManager +// ReaderManager *ReaderManager::rm_instance_ptr_ = nullptr; +// std::mutex ReaderManager::m_; + +class FileLabelLoaderOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const { + PADDLE_ENFORCE_EQ(ctx->HasInput("Indices"), true, + platform::errors::InvalidArgument( + "Input(Indices) of ReadFileLoaderOp is null.")); + PADDLE_ENFORCE_EQ(ctx->HasOutput("Image"), true, + platform::errors::InvalidArgument( + "Output(Image) of ReadFileLoaderOp is null.")); + PADDLE_ENFORCE_EQ(ctx->HasOutput("Label"), true, + platform::errors::InvalidArgument( + "Output(Label) of ReadFileLoaderOp is null.")); + + auto dim_indices = ctx->GetInputDim("Indices"); + PADDLE_ENFORCE_EQ(dim_indices.size(), 1, + platform::errors::InvalidArgument( + "Input(Indices) should be a 1-D Tensor")); + + auto files = ctx->Attrs().Get>("files"); + auto labels = ctx->Attrs().Get>("labels"); + PADDLE_ENFORCE_GT(files.size(), 0, + platform::errors::InvalidArgument( + "length of files should be greater than 0")); + PADDLE_ENFORCE_GT(labels.size(), 0, + platform::errors::InvalidArgument( + "length of labels should be greater than 0")); + PADDLE_ENFORCE_EQ(files.size(), labels.size(), + platform::errors::InvalidArgument( + "length of labels and files should be equal")); + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + return framework::OpKernelType(framework::proto::VarType::UINT8, + platform::CPUPlace()); + } + +// private: +// void RunImpl(const framework::Scope& scope, +// const platform::Place& dev_place) const override { +// LOG(ERROR) << "FileLabelLoaderOp RunImpl start"; +// 
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); +// auto& dev_ctx = *pool.Get(dev_place); +// framework::RuntimeContext run_ctx(Inputs(), Outputs(), scope); +// framework::ExecutionContext ctx(*this, scope, dev_ctx, run_ctx); +// +// auto* out = scope.FindVar(Output("Out")); +// auto out_queue = out->Get().GetQueue(); +// if (out_queue == nullptr) { +// LOG(ERROR) << "FileLabelLoaderOp init output queue"; +// auto* holder = out->template GetMutable(); +// holder->InitOnce(2); +// out_queue = holder->GetQueue(); +// } +// +// auto* out_label = scope.FindVar(Output("Label")); +// auto out_label_queue = +// out_label->Get().GetQueue(); +// if (out_label_queue == nullptr) { +// LOG(ERROR) << "FileLabelLoaderOp init output label queue"; +// auto* label_holder = +// out_label->template GetMutable(); +// label_holder->InitOnce(2); +// out_label_queue = label_holder->GetQueue(); +// } +// +// ReaderManager::Instance()->GetReader( +// 0, ctx, out_queue.get(), out_label_queue.get()); +// // LoDTensorArray samples = reader_wrapper.reader->Next(); +// // framework::LoDTensorArray out_array; +// // out_array.resize(samples.size()); +// // for (size_t i = 0; i < samples.size(); ++i) { +// // copy_tensor(samples[i], &out_array[i]); +// // } +// // out_queue->Push(out_array); +// LOG(ERROR) << "FileLabelLoaderOp RunImpl finish"; +// } + + // std::shared_ptr reader=nullptr; +}; + +class FileLabelLoaderOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Indices", "The batch indices of input samples"); + AddOutput("Image", "The output image tensor of ReadFileLoader op"); + AddOutput("Label", "The output label tensor of ReadFileLoader op"); + AddAttr>("files", "Path of the file to be readed.") + .SetDefault({}); + AddAttr>("labels", "Path of the file to be readed.") + .SetDefault({}); + AddComment(R"DOC( +This operator read a file. 
+)DOC"); + // AddAttr("root_dir", "Path of the file to be readed.") + // .SetDefault(""); + // AddAttr("batch_size", "Path of the file to be readed.").SetDefault(1); + // AddAttr("rank", "Path of the file to be readed.").SetDefault(0); + // AddAttr("world_size", "Path of the file to be readed.").SetDefault(1); + // AddAttr("reader_id", + // "(int64_t)" + // "The unique hash id used as cache key for " + // "ExecutorInfoCache").SetDefault(0);; + } +}; + +// class FileLabelReaderInferShape : public framework::InferShapeBase { +// public: +// void operator()(framework::InferShapeContext* context) const override { +// OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", +// "FileLabelReader"); +// } +// }; +// +// class FileLabelReaderInferVarType : public framework::VarTypeInference { +// public: +// void operator()(framework::InferVarTypeContext* ctx) const override { +// ctx->SetOutputType("Out", framework::proto::VarType::LOD_TENSOR_ARRAY, +// framework::ALL_ELEMENTS); +// } +// }; + +} // namespace data +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators::data; + +REGISTER_OPERATOR( + file_label_loader, ops::FileLabelLoaderOp, ops::FileLabelLoaderOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker) + +REGISTER_OP_CPU_KERNEL(file_label_loader, ops::FileLabelLoaderCPUKernel) diff --git a/paddle/fluid/operators/data/file_label_loader_op.h b/paddle/fluid/operators/data/file_label_loader_op.h new file mode 100644 index 00000000000000..8dbd8bc0c26eea --- /dev/null +++ b/paddle/fluid/operators/data/file_label_loader_op.h @@ -0,0 +1,308 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include + +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" + +namespace paddle { +namespace operators { +namespace data { +using LoDTensor = framework::LoDTensor; +using LoDTensorArray = framework::LoDTensorArray; + +// class FileDataReader { +// public: +// explicit FileDataReader(const framework::ExecutionContext& ctx, +// LoDTensorBlockingQueue* queue, LoDTensorBlockingQueue* label_queue) +// : queue_(queue), label_queue_(label_queue){ +// std::vector files = +// ctx.Attr>("files"); +// std::vector labels = ctx.Attr>("labels"); +// rank_ = ctx.Attr("rank"); +// world_size_ = ctx.Attr("world_size"); +// +// batch_size_ = ctx.Attr("batch_size"); +// current_epoch_ = 0; +// current_iter_ = 0; +// // iters_per_epoch_ = labels.size() / (batch_size_ * world_size_); +// auto total_batch_size = batch_size_ * world_size_; +// iters_per_epoch_ = (labels.size() + total_batch_size) / total_batch_size; +// is_closed_ = false; +// for (int i = 0, n = files.size(); i < n; i++) +// image_label_pairs_.emplace_back(std::move(files[i]), labels[i]); +// StartLoadThread(); +// } +// +// int GetStartIndex() { +// int start_idx = +// batch_size_ * world_size_ * (current_iter_ % iters_per_epoch_) + +// rank_ * batch_size_; +// current_iter_++; +// return start_idx; +// } +// 
+// framework::LoDTensor ReadSample(const std::string filename) { +// std::ifstream input(filename.c_str(), +// std::ios::in | std::ios::binary | std::ios::ate); +// std::streamsize file_size = input.tellg(); +// +// input.seekg(0, std::ios::beg); +// +// // auto* out = ctx.Output("Out"); +// framework::LoDTensor out; +// std::vector out_shape = {file_size}; +// out.Resize(framework::make_ddim(out_shape)); +// +// uint8_t* data = out.mutable_data(platform::CPUPlace()); +// +// input.read(reinterpret_cast(data), file_size); +// return out; +// } +// +// void StartLoadThread() { +// if (load_thrd_.joinable()) { +// return; +// } +// +// load_thrd_ = std::thread([this] { +// while (!is_closed_.load()) LoadBatch(); +// }); +// } +// +// void ShutDown() { +// if (queue_ && !queue_->IsClosed()) queue_->Close(); +// if (label_queue_ && !label_queue_->IsClosed()) label_queue_->Close(); +// +// is_closed_.store(true); +// if (load_thrd_.joinable()) { +// load_thrd_.join(); +// } +// } +// +// +// std::pair> Read() { +// LoDTensorArray ret; +// std::vector label; +// ret.reserve(batch_size_); +// int start_index = GetStartIndex(); +// for (int32_t i = start_index; i < start_index + batch_size_; ++i) { +// if (static_cast(i) >= image_label_pairs_.size()) { +// // FIXME(dkp): refine close pipeline +// while (queue_->Size()) sleep(0.5); +// queue_->Close(); +// while (label_queue_->Size()) sleep(0.5); +// label_queue_->Close(); +// +// is_closed_.store(true); +// break; +// } +// i %= image_label_pairs_.size(); +// framework::LoDTensor tmp = ReadSample(image_label_pairs_[i].first); +// ret.push_back(std::move(tmp)); +// label.push_back(image_label_pairs_[i].second); +// } +// return std::make_pair(ret, label); +// } +// +// +// void LoadBatch() { +// +// auto batch_data = std::move(Read()); +// queue_->Push(batch_data.first); +// framework::LoDTensor label_tensor; +// LoDTensorArray label_array; +// // auto& label_tensor = label.GetMutable(); +// label_tensor.Resize( +// 
framework::make_ddim({static_cast(batch_data.first.size())})); +// platform::CPUPlace cpu; +// auto* label_data = label_tensor.mutable_data(cpu); +// for (size_t i = 0; i < batch_data.first.size(); ++i) { +// label_data[i] = batch_data.second[i]; +// } +// label_array.push_back(label_tensor); +// label_queue_->Push(label_array); +// } +// +// private: +// int batch_size_; +// std::string file_root_, file_list_; +// std::vector> image_label_pairs_; +// int current_epoch_; +// int current_iter_; +// int rank_; +// int world_size_; +// int iters_per_epoch_; +// std::atomic is_closed_; +// Buffer batch_buffer_; +// std::thread load_thrd_; +// LoDTensorBlockingQueue* queue_; +// LoDTensorBlockingQueue* label_queue_; +// }; +// +// +// class ReaderManager { +// // PipelineManager is a signleton manager for Pipeline, we +// // create single Pipeline for a program id +// private: +// DISABLE_COPY_AND_ASSIGN(ReaderManager); +// +// static ReaderManager *rm_instance_ptr_; +// static std::mutex m_; +// +// std::map> prog_id_to_reader_; +// +// public: +// static ReaderManager *Instance() { +// if (rm_instance_ptr_ == nullptr) { +// std::lock_guard lk(m_); +// if (rm_instance_ptr_ == nullptr) { +// rm_instance_ptr_ = new ReaderManager; +// } +// } +// return rm_instance_ptr_; +// } +// +// // FileDataReader* GetReader( +// void GetReader( +// int64_t program_id, const framework::ExecutionContext& ctx, +// LoDTensorBlockingQueue* queue, LoDTensorBlockingQueue* label_queue) { +// auto iter = prog_id_to_reader_.find(program_id); +// if (iter == prog_id_to_reader_.end()) { +// prog_id_to_reader_[program_id] = std::unique_ptr(new FileDataReader(ctx, queue, label_queue)); +// // return prog_id_to_reader_[program_id].get(); +// } else { +// // return iter->second.get(); +// } +// } +// +// void ShutDown() { +// auto iter = prog_id_to_reader_.begin(); +// while (iter != prog_id_to_reader_.end()){ +// if(iter->second.get()){ +// iter->second->ShutDown(); +// } +// iter++; +// } +// 
prog_id_to_reader_.clear(); +// } +// +// ReaderManager() { VLOG(1) << "ReaderManager init"; } +// +// ~ReaderManager() { +// VLOG(1) << "~ReaderManager"; +// ShutDown(); +// } +// }; + +template +class FileLabelLoaderCPUKernel: public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + LOG(ERROR) << "FileLabelLoaderOp RunImpl start"; + auto* indices = ctx.Input("Indices"); + auto* image_arr = ctx.Output("Image"); + auto* label_arr = ctx.Output("Label"); + + auto files = ctx.Attr>("files"); + auto labels = ctx.Attr>("labels"); + + auto batch_size = indices->dims()[0]; + const int* indices_data = indices->data(); + + image_arr->reserve(batch_size); + std::vector label_vec; + label_vec.reserve(batch_size); + for (int i = 0; i < batch_size; i++) { + int index = indices_data[i]; + std::ifstream input(files[index].c_str(), + std::ios::in | std::ios::binary | std::ios::ate); + std::streamsize file_size = input.tellg(); + + input.seekg(0, std::ios::beg); + + framework::LoDTensor image; + std::vector image_len = {file_size}; + image.Resize(framework::make_ddim(image_len)); + + uint8_t* data = image.mutable_data(platform::CPUPlace()); + + input.read(reinterpret_cast(data), file_size); + + image_arr->emplace_back(image); + label_vec.emplace_back(labels[index]); + } + + framework::LoDTensor label_tensor; + label_tensor.Resize( + framework::make_ddim({static_cast(label_vec.size())})); + auto* label_data = label_tensor.mutable_data(platform::CPUPlace()); + for (int i = 0; i < batch_size; i++) label_data[i] = label_vec[i]; + + label_arr->reserve(1); + label_arr->emplace_back(label_tensor); + LOG(ERROR) << "FileLabelLoaderOp RunImpl finish"; + + // auto out_queue = out->Get().GetQueue(); + // if (out_queue == nullptr) { + // LOG(ERROR) << "FileLabelLoaderOp init output queue"; + // auto* holder = out->template GetMutable(); + // holder->InitOnce(2); + // out_queue = holder->GetQueue(); + // } + // + // auto* out_label = 
scope.FindVar(Output("Label")); + // auto out_label_queue = + // out_label->Get().GetQueue(); + // if (out_label_queue == nullptr) { + // LOG(ERROR) << "FileLabelLoaderOp init output label queue"; + // auto* label_holder = + // out_label->template GetMutable(); + // label_holder->InitOnce(2); + // out_label_queue = label_holder->GetQueue(); + // } + + // ReaderManager::Instance()->GetReader( + // 0, ctx, out_queue.get(), out_label_queue.get()); + // LoDTensorArray samples = reader_wrapper.reader->Next(); + // framework::LoDTensorArray out_array; + // out_array.resize(samples.size()); + // for (size_t i = 0; i < samples.size(); ++i) { + // copy_tensor(samples[i], &out_array[i]); + // } + // out_queue->Push(out_array); + } + + private: + void copy_tensor(const framework::LoDTensor& lod_tensor, + framework::LoDTensor* out) const { + if (lod_tensor.numel() == 0) return; + auto& out_tensor = *out; + TensorCopy(lod_tensor, lod_tensor.place(), &out_tensor); + out_tensor.set_lod(lod_tensor.lod()); + } + +}; + +} // namespace data +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/data/file_label_reader_op.cc b/paddle/fluid/operators/data/file_label_reader_op.cc deleted file mode 100644 index d6648ed86e4099..00000000000000 --- a/paddle/fluid/operators/data/file_label_reader_op.cc +++ /dev/null @@ -1,162 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/operators/data/file_label_reader_op.h" - -namespace paddle { -namespace operators { -namespace data { -// FileDataReaderWrapper reader_wrapper; - -// initialization static variables out of ReaderManager -ReaderManager *ReaderManager::rm_instance_ptr_ = nullptr; -std::mutex ReaderManager::m_; - -template -class CPUFileLabelKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override {} -}; - -class FileLabelReaderOp : public framework::OperatorBase { - public: - // using framework::OperatorWithKernel::OperatorWithKernel; - FileLabelReaderOp(const std::string& type, - const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - void InferShape(framework::InferShapeContext* ctx) const { - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of ReadFileOp is null.")); - } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const { - return framework::OpKernelType(framework::proto::VarType::UINT8, - platform::CPUPlace()); - } - - private: - void RunImpl(const framework::Scope& scope, - const platform::Place& dev_place) const override { - LOG(ERROR) << "FileLabelReaderOp RunImpl start"; - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& dev_ctx = *pool.Get(dev_place); - framework::RuntimeContext run_ctx(Inputs(), Outputs(), scope); - framework::ExecutionContext ctx(*this, scope, dev_ctx, run_ctx); - - auto* out = scope.FindVar(Output("Out")); - auto out_queue = out->Get().GetQueue(); - if (out_queue == nullptr) { - LOG(ERROR) << "FileLabelReaderOp init output queue"; - auto* holder = out->template GetMutable(); - holder->InitOnce(2); - out_queue = holder->GetQueue(); - } - - auto* out_label = scope.FindVar(Output("Label")); - auto 
out_label_queue = - out_label->Get().GetQueue(); - if (out_label_queue == nullptr) { - LOG(ERROR) << "FileLabelReaderOp init output label queue"; - auto* label_holder = - out_label->template GetMutable(); - label_holder->InitOnce(2); - out_label_queue = label_holder->GetQueue(); - } - - // if (reader_wrapper.reader == nullptr) { - // // create reader - // reader_wrapper.SetUp(ctx, out_queue.get(), out_label_queue.get()); - // } - ReaderManager::Instance()->GetReader( - 0, ctx, out_queue.get(), out_label_queue.get()); - // LoDTensorArray samples = reader_wrapper.reader->Next(); - // framework::LoDTensorArray out_array; - // out_array.resize(samples.size()); - // for (size_t i = 0; i < samples.size(); ++i) { - // copy_tensor(samples[i], &out_array[i]); - // } - // out_queue->Push(out_array); - LOG(ERROR) << "FileLabelReaderOp RunImpl finish"; - } - - void copy_tensor(const framework::LoDTensor& lod_tensor, - framework::LoDTensor* out) const { - if (lod_tensor.numel() == 0) return; - auto& out_tensor = *out; - TensorCopy(lod_tensor, lod_tensor.place(), &out_tensor); - out_tensor.set_lod(lod_tensor.lod()); - } - - // std::shared_ptr reader=nullptr; -}; - -class FileLabelReaderOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddOutput("Out", "The output tensor of ReadFile op"); - AddOutput("Label", "The output tensor of ReadFile op"); - AddComment(R"DOC( -This operator read a file. 
-)DOC"); - AddAttr("root_dir", "Path of the file to be readed.") - .SetDefault(""); - AddAttr("batch_size", "Path of the file to be readed.").SetDefault(1); - AddAttr("rank", "Path of the file to be readed.").SetDefault(0); - AddAttr("world_size", "Path of the file to be readed.").SetDefault(1); - AddAttr("reader_id", - "(int64_t)" - "The unique hash id used as cache key for " - "ExecutorInfoCache").SetDefault(0);; - AddAttr>("files", "Path of the file to be readed.") - .SetDefault({}); - AddAttr>("labels", "Path of the file to be readed.") - .SetDefault({}); - } -}; - -class FileLabelReaderInferShape : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext* context) const override { - OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", - "FileLabelReader"); - } -}; - -class FileLabelReaderInferVarType : public framework::VarTypeInference { - public: - void operator()(framework::InferVarTypeContext* ctx) const override { - ctx->SetOutputType("Out", framework::proto::VarType::LOD_TENSOR_ARRAY, - framework::ALL_ELEMENTS); - } -}; - -} // namespace data -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators::data; - -REGISTER_OPERATOR( - file_label_reader, ops::FileLabelReaderOp, ops::FileLabelReaderOpMaker, - ops::FileLabelReaderInferShape, ops::FileLabelReaderInferVarType, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker) - -REGISTER_OP_CPU_KERNEL(file_label_reader, ops::CPUFileLabelKernel) diff --git a/paddle/fluid/operators/data/file_label_reader_op.h b/paddle/fluid/operators/data/file_label_reader_op.h deleted file mode 100644 index a0e70455ce8663..00000000000000 --- a/paddle/fluid/operators/data/file_label_reader_op.h +++ /dev/null @@ -1,280 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include - -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/framework/lod_tensor_array.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" - -namespace paddle { -namespace operators { -namespace data { -using LoDTensorArray = framework::LoDTensorArray; -using LoDTensorBlockingQueue = operators::reader::LoDTensorBlockingQueue; -using LoDTensorBlockingQueueHolder = operators::reader::LoDTensorBlockingQueueHolder; - -enum BufferStatus { - kBufferStatusSuccess = 0, - kBufferStatusErrorClosed, - kBufferStatusEmpty -}; - -template -class Buffer final { - public: - explicit Buffer(size_t max_len = 2) : max_len_(max_len), is_closed_(false) {} - ~Buffer() = default; - - BufferStatus Push(const T& item); - BufferStatus Pull(T* item); - BufferStatus TryReceive(T* item); - void Close(); - - private: - std::queue queue_; - mutable std::mutex mutex_; - size_t max_len_; - bool is_closed_; - std::condition_variable cond_; -}; - -template -BufferStatus Buffer::Push(const T& item) { - std::unique_lock lock(mutex_); - cond_.wait(lock, [this]() { return queue_.size() < max_len_ || is_closed_; }); - if (is_closed_) { - return kBufferStatusErrorClosed; - } - - queue_.push(item); - cond_.notify_one(); - return kBufferStatusSuccess; -} - -template -BufferStatus Buffer::Pull(T* item) { - std::unique_lock lock(mutex_); - cond_.wait(lock, [this]() { return 
(!queue_.empty()) || is_closed_; }); - if (queue_.empty()) { - return kBufferStatusErrorClosed; - } - *item = queue_.front(); - queue_.pop(); - if (queue_.size() < max_len_) { - cond_.notify_all(); - } - return kBufferStatusSuccess; -} - -template -void Buffer::Close() { - std::unique_lock lock(mutex_); - is_closed_ = true; - cond_.notify_all(); -} - -class FileDataReader { - public: - explicit FileDataReader(const framework::ExecutionContext& ctx, - LoDTensorBlockingQueue* queue, LoDTensorBlockingQueue* label_queue) - : queue_(queue), label_queue_(label_queue){ - std::vector files = - ctx.Attr>("files"); - std::vector labels = ctx.Attr>("labels"); - rank_ = ctx.Attr("rank"); - world_size_ = ctx.Attr("world_size"); - - batch_size_ = ctx.Attr("batch_size"); - current_epoch_ = 0; - current_iter_ = 0; - // iters_per_epoch_ = labels.size() / (batch_size_ * world_size_); - auto total_batch_size = batch_size_ * world_size_; - iters_per_epoch_ = (labels.size() + total_batch_size) / total_batch_size; - is_closed_ = false; - for (int i = 0, n = files.size(); i < n; i++) - image_label_pairs_.emplace_back(std::move(files[i]), labels[i]); - StartLoadThread(); - } - - int GetStartIndex() { - int start_idx = - batch_size_ * world_size_ * (current_iter_ % iters_per_epoch_) + - rank_ * batch_size_; - current_iter_++; - return start_idx; - } - - framework::LoDTensor ReadSample(const std::string filename) { - std::ifstream input(filename.c_str(), - std::ios::in | std::ios::binary | std::ios::ate); - std::streamsize file_size = input.tellg(); - - input.seekg(0, std::ios::beg); - - // auto* out = ctx.Output("Out"); - framework::LoDTensor out; - std::vector out_shape = {file_size}; - out.Resize(framework::make_ddim(out_shape)); - - uint8_t* data = out.mutable_data(platform::CPUPlace()); - - input.read(reinterpret_cast(data), file_size); - return out; - } - - void StartLoadThread() { - if (load_thrd_.joinable()) { - return; - } - - load_thrd_ = std::thread([this] { - while 
(!is_closed_.load()) LoadBatch(); - }); - } - - void ShutDown() { - if (queue_ && !queue_->IsClosed()) queue_->Close(); - if (label_queue_ && !label_queue_->IsClosed()) label_queue_->Close(); - - is_closed_.store(true); - if (load_thrd_.joinable()) { - load_thrd_.join(); - } - } - - - std::pair> Read() { - LoDTensorArray ret; - std::vector label; - ret.reserve(batch_size_); - int start_index = GetStartIndex(); - for (int32_t i = start_index; i < start_index + batch_size_; ++i) { - if (static_cast(i) >= image_label_pairs_.size()) { - // FIXME(dkp): refine close pipeline - while (queue_->Size()) sleep(0.5); - queue_->Close(); - while (label_queue_->Size()) sleep(0.5); - label_queue_->Close(); - - is_closed_.store(true); - break; - } - i %= image_label_pairs_.size(); - framework::LoDTensor tmp = ReadSample(image_label_pairs_[i].first); - ret.push_back(std::move(tmp)); - label.push_back(image_label_pairs_[i].second); - } - return std::make_pair(ret, label); - } - - - void LoadBatch() { - - auto batch_data = std::move(Read()); - queue_->Push(batch_data.first); - framework::LoDTensor label_tensor; - LoDTensorArray label_array; - // auto& label_tensor = label.GetMutable(); - label_tensor.Resize( - framework::make_ddim({static_cast(batch_data.first.size())})); - platform::CPUPlace cpu; - auto* label_data = label_tensor.mutable_data(cpu); - for (size_t i = 0; i < batch_data.first.size(); ++i) { - label_data[i] = batch_data.second[i]; - } - label_array.push_back(label_tensor); - label_queue_->Push(label_array); - } - - private: - int batch_size_; - std::string file_root_, file_list_; - std::vector> image_label_pairs_; - int current_epoch_; - int current_iter_; - int rank_; - int world_size_; - int iters_per_epoch_; - std::atomic is_closed_; - Buffer batch_buffer_; - std::thread load_thrd_; - LoDTensorBlockingQueue* queue_; - LoDTensorBlockingQueue* label_queue_; -}; - - -class ReaderManager { - // PipelineManager is a signleton manager for Pipeline, we - // create single 
Pipeline for a program id - private: - DISABLE_COPY_AND_ASSIGN(ReaderManager); - - static ReaderManager *rm_instance_ptr_; - static std::mutex m_; - - std::map> prog_id_to_reader_; - - public: - static ReaderManager *Instance() { - if (rm_instance_ptr_ == nullptr) { - std::lock_guard lk(m_); - if (rm_instance_ptr_ == nullptr) { - rm_instance_ptr_ = new ReaderManager; - } - } - return rm_instance_ptr_; - } - - // FileDataReader* GetReader( - void GetReader( - int64_t program_id, const framework::ExecutionContext& ctx, - LoDTensorBlockingQueue* queue, LoDTensorBlockingQueue* label_queue) { - auto iter = prog_id_to_reader_.find(program_id); - if (iter == prog_id_to_reader_.end()) { - prog_id_to_reader_[program_id] = std::unique_ptr(new FileDataReader(ctx, queue, label_queue)); - // return prog_id_to_reader_[program_id].get(); - } else { - // return iter->second.get(); - } - } - - void ShutDown() { - auto iter = prog_id_to_reader_.begin(); - while (iter != prog_id_to_reader_.end()){ - if(iter->second.get()){ - iter->second->ShutDown(); - } - iter++; - } - prog_id_to_reader_.clear(); - } - - ReaderManager() { VLOG(1) << "ReaderManager init"; } - - ~ReaderManager() { - VLOG(1) << "~ReaderManager"; - ShutDown(); - } -}; - -} // namespace data -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/data/map_op.cc b/paddle/fluid/operators/data/map_op.cc index 9e8d93305b4031..70f26457e963ee 100644 --- a/paddle/fluid/operators/data/map_op.cc +++ b/paddle/fluid/operators/data/map_op.cc @@ -89,7 +89,6 @@ class MapInferVarType : public framework::VarTypeInference { void operator()(framework::InferVarTypeContext* ctx) const override {} }; - class MapOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { diff --git a/paddle/fluid/operators/data/map_runner.h b/paddle/fluid/operators/data/map_runner.h index 8fb3d277290c56..e491a18daf500e 100644 --- a/paddle/fluid/operators/data/map_runner.h +++ 
b/paddle/fluid/operators/data/map_runner.h @@ -24,7 +24,6 @@ namespace operators { using BlockDesc = framework::BlockDesc; using Scope = framework::Scope; -using ParallelExecutor = framework::ParallelExecutor; using Variable = framework::Variable; using LoDTensor = framework::LoDTensor; diff --git a/paddle/fluid/operators/data/nvjpeg_decoder.cc b/paddle/fluid/operators/data/nvjpeg_decoder.cc index 931fb1411085cf..5255f5bc5a01ad 100644 --- a/paddle/fluid/operators/data/nvjpeg_decoder.cc +++ b/paddle/fluid/operators/data/nvjpeg_decoder.cc @@ -33,7 +33,7 @@ NvjpegDecoder::NvjpegDecoder(std::string mode, int dev_id) // pinned_allocator_.pinned_malloc = &cudaMallocHost; // pinned_allocator_.pinned_free = &cudaFreeHost; PADDLE_ENFORCE_NVJPEG_SUCCESS( - platform::dynload::nvjpegCreateEx(NVJPEG_BACKEND_DEFAULT, &device_allocator_, + platform::dynload::nvjpegCreateEx(NVJPEG_BACKEND_HYBRID, &device_allocator_, &pinned_allocator_, 0, &handle_)); for (size_t i = 0; i < nvjpeg_streams_.size(); i++) { PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegJpegStreamCreate(handle_, &nvjpeg_streams_[i])); diff --git a/paddle/fluid/operators/data/shutdown.h b/paddle/fluid/operators/data/shutdown.h index fead833296fc45..79226d8bd6e4c1 100644 --- a/paddle/fluid/operators/data/shutdown.h +++ b/paddle/fluid/operators/data/shutdown.h @@ -13,7 +13,7 @@ // limitations under the License. 
#pragma once -#include "paddle/fluid/operators/data/file_label_reader_op.h" +#include "paddle/fluid/operators/data/file_label_loader_op.h" #include "paddle/fluid/operators/data/nvjpeg_decoder.h" #include "paddle/fluid/operators/data/map_runner.h" @@ -29,10 +29,10 @@ extern NvjpegDecoderThreadPool* decode_pool; void ShutDownDataLoader() { LOG(ERROR) << "ShutDownDataLoader enter"; - // step 1: shutdown reader - // reader_wrapper.ShutDown(); - ReaderManager::Instance()->ShutDown(); - LOG(ERROR) << "ShutDownDataLoader reader_wrapper shutdown finish"; + // // step 1: shutdown reader + // // reader_wrapper.ShutDown(); + // ReaderManager::Instance()->ShutDown(); + // LOG(ERROR) << "ShutDownDataLoader reader_wrapper shutdown finish"; // step 2: shutdown decoder if (decode_pool) decode_pool->ShutDown(); diff --git a/python/paddle/fluid/dataloader/ops.py b/python/paddle/fluid/dataloader/ops.py index 357035e23618e1..0a0d5b43de8d35 100755 --- a/python/paddle/fluid/dataloader/ops.py +++ b/python/paddle/fluid/dataloader/ops.py @@ -30,7 +30,7 @@ def _to_list(l): return [l] -class MapGuard(object): +class _ProgramGuard(object): def __init__(self, main_program): if not isinstance(main_program, Program): raise TypeError("MapGuard should init with a Program") @@ -67,7 +67,7 @@ def map(map_func, inputs): # build map block main_program = helper.main_program - with MapGuard(main_program): + with _ProgramGuard(main_program): program_id = _hash_with_id(main_program, map_func) map_block = main_program.current_block() @@ -111,3 +111,57 @@ def map(map_func, inputs): attrs=attrs) return outputs + + +def data_reader(reader_func, + batch_size=1, + num_samples=1, + shuffle=False, + drop_last=False): + assert not in_dygraph_mode(), \ + "paddle.io.data_reader can only be used in static mode" + helper = LayerHelper("data_reader", **locals()) + + # build reader block + main_program = helper.main_program + with _ProgramGuard(main_program): + program_id = _hash_with_id(main_program, reader_func) + 
reader_block = main_program.current_block() + + indices_var = reader_block.create_var( + name=unique_name.generate("data_reader_sub"), + type=core.VarDesc.VarType.LOD_TENSOR, + dtype="uint8", + persistable=False) + program_outputs = reader_func(indices_var) + program_outputs = _to_list(program_outputs) + + indices_var_name = indices_var.name + output_var_names = [v.name for v in program_outputs] + + outputs = \ + [helper.create_variable( + name=unique_name.generate("map"), + type=outp.desc.type(), + persistable=True) for outp in program_outputs] + + attrs = { + "reader_id": reader_id, + "reader_block": reader_block, + "indices_var_name": indices_var_name, + "output_var_names": output_var_names, + "batch_size": batch_size, + "num_samples": num_samples, + "shuffle": shuffle, + "drop_last": drop_last, + "rank": paddle.distributed.get_rank(), + "world_size": paddle.distributed.get_world_size() + } + + helper.append_op( + type="data_reader", + inputs={}, + outputs={"Out": outputs}, + attrs=attrs) + + return outputs diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index b6a186279806f2..2dc50b7ca86b88 100644 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -867,6 +867,53 @@ def read_file(filename, name=None): return out +def file_label_loader(data_root, indices, name=None): + """ + Reads a batch of data, outputs the bytes contents of a file + as a uint8 Tensor with one dimension. + + Args: + data_root (str): root directory of data + indices (list of int): batch indices of samples + name (str, optional): The default value is None. Normally there is no + need for user to set this property. For more information, please + refer to :ref:`api_guide_Name`. 
+ """ + from paddle.vision.datasets import DatasetFolder + data_folder = DatasetFolder(data_root) + samples = [s[0] for s in data_folder.samples] + targets = [s[1] for s in data_folder.samples] + + if in_dygraph_mode(): + return _C_ops.file_label_loader(indices, 'files', samples, 'labels', targets) + + inputs = {"Indices": indices} + attrs = { + 'files': samples, + 'labels': targets, + } + + helper = LayerHelper("file_label_loader", **locals()) + image = helper.create_variable( + name=unique_name.generate("file_label_loader"), + type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, + dtype='uint8') + + label = helper.create_variable( + name=unique_name.generate("file_label_loader"), + type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, + dtype='int') + + helper.append_op( + type="file_label_loader", + inputs=inputs, + attrs=attrs, + outputs={"Image": image, + "Label": label}) + + return image, label + + def file_label_reader(file_root, batch_size, name=None): """ Reads and outputs the bytes contents of a file as a uint8 Tensor From 4dd29d59afd70454ab702d37b18e2ec0bf9040e2 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Wed, 19 Jan 2022 16:50:18 +0000 Subject: [PATCH 48/95] run success, hang to fix --- paddle/fluid/operators/data/data_reader_op.cc | 29 ++++--- paddle/fluid/operators/data/data_reader_op.h | 21 +++-- .../operators/data/file_label_loader_op.h | 6 +- python/paddle/fluid/dataloader/ops.py | 8 +- python/paddle/io/__init__.py | 4 +- python/paddle/vision/ops.py | 78 ++++++++++--------- 6 files changed, 87 insertions(+), 59 deletions(-) diff --git a/paddle/fluid/operators/data/data_reader_op.cc b/paddle/fluid/operators/data/data_reader_op.cc index 955e12e28be522..7a637b64e35eb4 100644 --- a/paddle/fluid/operators/data/data_reader_op.cc +++ b/paddle/fluid/operators/data/data_reader_op.cc @@ -34,13 +34,13 @@ class DataReaderOp : public framework::OperatorBase { OP_INOUT_CHECK(ctx->HasOutputs("Out"), "Output", "Out", "DataReaderOp"); } - protected: - framework::OpKernelType 
GetExpectedKernelType( - const framework::ExecutionContext& ctx) const { - return framework::OpKernelType(framework::proto::VarType::FP32, - ctx.GetPlace()); - } - +// protected: +// framework::OpKernelType GetExpectedKernelType( +// const framework::ExecutionContext& ctx) const { +// return framework::OpKernelType(framework::proto::VarType::FP32, +// ctx.GetPlace()); +// } +// private: void RunImpl(const framework::Scope& scope, const platform::Place& dev_place) const override { @@ -72,6 +72,18 @@ class DataReaderOp : public framework::OperatorBase { } }; +class DataReaderInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasOutputs("Out"), "Output", "Out", "MapOp"); + } +}; + +class DataReaderInferVarType : public framework::VarTypeInference { + public: + void operator()(framework::InferVarTypeContext* ctx) const override {} +}; + class DataReaderOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -113,7 +125,6 @@ namespace ops = paddle::operators::data; REGISTER_OPERATOR( data_reader, ops::DataReaderOp, ops::DataReaderOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker) + ops::DataReaderInferShape, ops::DataReaderInferVarType) REGISTER_OP_CPU_KERNEL(data_reader, ops::DataReaderCPUKernel) diff --git a/paddle/fluid/operators/data/data_reader_op.h b/paddle/fluid/operators/data/data_reader_op.h index 2bdd2a9ff8085e..d59ab369abfd8b 100644 --- a/paddle/fluid/operators/data/data_reader_op.h +++ b/paddle/fluid/operators/data/data_reader_op.h @@ -123,8 +123,17 @@ class DataReader { while (running_.load()) { std::vector indices; sampler_.GetNextIndices(&indices); + LOG(ERROR) << "DataReaderOp thread got indices " << indices.size(); // shutdown reader if indices drained - if (indices.size() == 0) ShutDown(); + if (indices.size() == 0) { + for(auto& queue: output_queues_) { + while (queue->Size()) sleep(0.5); 
+ queue->Close(); + } + + running_.store(false); + return; + } ShareIndicesIntoScope(&scope_, indices); @@ -158,6 +167,7 @@ class DataReader { output_queues_[i]->Push(t_arr); } } + LOG(ERROR) << "ReaderThread output"; } scope->DeleteScope(&scope_); }); @@ -175,12 +185,11 @@ class DataReader { void ShareIndicesIntoScope(Scope* scope, std::vector indices) { - // get indices variable from scope auto* var = scope->Var(indices_var_name_); auto* indices_tensor = var->GetMutable(); - indices_tensor->Resize(framework::make_ddim({batch_size_})); - auto* indices_data = indices_tensor->mutable_data(place_); + indices_tensor->Resize(framework::make_ddim({static_cast(indices.size())})); + auto* indices_data = indices_tensor->mutable_data(platform::CPUPlace()); for (size_t i = 0; i < indices.size(); i++) { indices_data[i] = indices[i]; @@ -272,13 +281,13 @@ static void CheckAndInitOutputQueue(const std::vector& vars, int capa if (var->IsInitialized()) { PADDLE_ENFORCE_EQ(var->IsType(), true, platform::errors::InvalidArgument( - "Output Variables of MapOp should hold " + "Output Variables of DataLoaderOp should hold " "LoDTensorBlockingQueueHolder type")); auto queue = var->Get().GetQueue(); if (queue == nullptr) { auto* holder = var->template GetMutable(); holder->InitOnce(capacity); - LOG(ERROR) << "MapOpKernel init queue" << holder->GetQueue(); + LOG(ERROR) << "DataLoaderOpKernel init queue" << holder->GetQueue(); } } else { VLOG(1) << "Initialize Output LoDTensorBlockingQueue capacity " << capacity; diff --git a/paddle/fluid/operators/data/file_label_loader_op.h b/paddle/fluid/operators/data/file_label_loader_op.h index 8dbd8bc0c26eea..217df6504e6afe 100644 --- a/paddle/fluid/operators/data/file_label_loader_op.h +++ b/paddle/fluid/operators/data/file_label_loader_op.h @@ -227,13 +227,13 @@ class FileLabelLoaderCPUKernel: public framework::OpKernel { auto labels = ctx.Attr>("labels"); auto batch_size = indices->dims()[0]; - const int* indices_data = indices->data(); + const 
int64_t* indices_data = indices->data(); image_arr->reserve(batch_size); std::vector label_vec; label_vec.reserve(batch_size); - for (int i = 0; i < batch_size; i++) { - int index = indices_data[i]; + for (int64_t i = 0; i < batch_size; i++) { + int64_t index = indices_data[i]; std::ifstream input(files[index].c_str(), std::ios::in | std::ios::binary | std::ios::ate); std::streamsize file_size = input.tellg(); diff --git a/python/paddle/fluid/dataloader/ops.py b/python/paddle/fluid/dataloader/ops.py index 0a0d5b43de8d35..f47e76599caca1 100755 --- a/python/paddle/fluid/dataloader/ops.py +++ b/python/paddle/fluid/dataloader/ops.py @@ -21,7 +21,7 @@ from ...common_ops_import import * -__all__ = ["map"] +__all__ = ["map", "data_reader"] def _to_list(l): @@ -125,13 +125,13 @@ def data_reader(reader_func, # build reader block main_program = helper.main_program with _ProgramGuard(main_program): - program_id = _hash_with_id(main_program, reader_func) + reader_id= _hash_with_id(main_program, reader_func) reader_block = main_program.current_block() indices_var = reader_block.create_var( name=unique_name.generate("data_reader_sub"), type=core.VarDesc.VarType.LOD_TENSOR, - dtype="uint8", + dtype="int64", persistable=False) program_outputs = reader_func(indices_var) program_outputs = _to_list(program_outputs) @@ -141,7 +141,7 @@ def data_reader(reader_func, outputs = \ [helper.create_variable( - name=unique_name.generate("map"), + name=unique_name.generate("data_reader"), type=outp.desc.type(), persistable=True) for outp in program_outputs] diff --git a/python/paddle/io/__init__.py b/python/paddle/io/__init__.py index 57fb31a723817d..8d362f69ee61b5 100755 --- a/python/paddle/io/__init__.py +++ b/python/paddle/io/__init__.py @@ -31,6 +31,7 @@ from ..fluid.dataloader import Subset # noqa: F401 from ..fluid.dataloader import random_split # noqa: F401 from ..fluid.dataloader import map # noqa: F401 +from ..fluid.dataloader import data_reader # noqa: F401 __all__ = [ #noqa 
'Dataset', @@ -49,5 +50,6 @@ 'WeightedRandomSampler', 'random_split', 'Subset', - 'map' + 'map', + 'data_reader', ] diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index 2dc50b7ca86b88..0c62c69382c60c 100644 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -914,7 +914,10 @@ def file_label_loader(data_root, indices, name=None): return image, label -def file_label_reader(file_root, batch_size, name=None): +def file_label_reader(file_root, + batch_size=1, + shuffle=False, + drop_last=False): """ Reads and outputs the bytes contents of a file as a uint8 Tensor with one dimension. @@ -942,42 +945,45 @@ def file_label_reader(file_root, batch_size, name=None): samples = [s[0] for s in data_folder.samples] targets = [s[1] for s in data_folder.samples] - import time - unq_reader_id = int(round(time.time()* 1000*1000)) - - if in_dygraph_mode(): - return _C_ops.file_label_reader('root_dir', file_root, 'batch_size', - batch_size, 'files', samples, 'labels', - targets, 'reader_id', unq_reader_id) - - inputs = dict() - attrs = { - 'root_dir': file_root, - 'batch_size': batch_size, - 'files': samples, - 'labels': targets, - 'reader_id': unq_reader_id, - } - - helper = LayerHelper("file_label_reader", **locals()) - out = helper.create_variable( - name=unique_name.generate("file_label_reader"), - type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, - dtype='uint8') - - label = helper.create_variable( - name=unique_name.generate("file_label_reader"), - type=core.VarDesc.VarType.LOD_TENSOR, - dtype='int') - - helper.append_op( - type="file_label_reader", - inputs=inputs, - attrs=attrs, - outputs={"Out": out, - "Label": label - }) + return _C_ops.file_label_loader(list(arange(batch_size)), "files", + samples, "labels", labels) + + def _reader(indices): + return file_label_loader(file_root, indices) + + return paddle.io.data_reader(_reader, + batch_size=batch_size, + num_samples=len(samples), + shuffle=shuffle, + drop_last=drop_last) + # inputs = 
dict() + # attrs = { + # 'root_dir': file_root, + # 'batch_size': batch_size, + # 'files': samples, + # 'labels': targets, + # 'reader_id': unq_reader_id, + # } + # + # helper = LayerHelper("file_label_reader", **locals()) + # out = helper.create_variable( + # name=unique_name.generate("file_label_reader"), + # type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, + # dtype='uint8') + # + # label = helper.create_variable( + # name=unique_name.generate("file_label_reader"), + # type=core.VarDesc.VarType.LOD_TENSOR, + # dtype='int') + # + # helper.append_op( + # type="file_label_reader", + # inputs=inputs, + # attrs=attrs, + # outputs={"Out": out, + # "Label": label + # }) return out, label From b6c2e1f549b75bd3057e95c1f5ba8cfaff521282 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Thu, 20 Jan 2022 03:54:27 +0000 Subject: [PATCH 49/95] fix speed --- .../data/batch_decode_random_crop_op.cu | 2 +- paddle/fluid/operators/data/data_reader_op.cc | 2 +- paddle/fluid/operators/data/data_reader_op.h | 5 +---- .../operators/data/file_label_loader_op.h | 18 ++++++------------ paddle/fluid/operators/data/shutdown.h | 12 ++++-------- python/paddle/vision/ops.py | 2 +- 6 files changed, 14 insertions(+), 27 deletions(-) diff --git a/paddle/fluid/operators/data/batch_decode_random_crop_op.cu b/paddle/fluid/operators/data/batch_decode_random_crop_op.cu index 0db44f0cc8bfb3..478ef5d676e966 100644 --- a/paddle/fluid/operators/data/batch_decode_random_crop_op.cu +++ b/paddle/fluid/operators/data/batch_decode_random_crop_op.cu @@ -31,7 +31,6 @@ class GPUBatchDecodeRandomCropKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { int num_threads = ctx.Attr("num_threads"); - LOG(ERROR) << "GPUBatchDecodeJpegKernel Compute start, num_threads: " << num_threads; auto mode = ctx.Attr("mode"); auto local_rank = ctx.Attr("local_rank"); // multi-phrase decode thread pool @@ -43,6 +42,7 @@ class GPUBatchDecodeRandomCropKernel : public 
framework::OpKernel { const framework::LoDTensorArray* inputs = ctx.Input("X"); + LOG(ERROR) << "GPUBatchDecodeJpegKernel Compute start, num_threads: " << num_threads << ", batch_size: " << inputs->size(); auto* out = ctx.OutputVar("Out"); auto dev = platform::CUDAPlace(local_rank); diff --git a/paddle/fluid/operators/data/data_reader_op.cc b/paddle/fluid/operators/data/data_reader_op.cc index 7a637b64e35eb4..6cfbd0d12f9c33 100644 --- a/paddle/fluid/operators/data/data_reader_op.cc +++ b/paddle/fluid/operators/data/data_reader_op.cc @@ -66,7 +66,7 @@ class DataReaderOp : public framework::OperatorBase { auto output_queues = GetQueueVecFromVariableVec(output_vars); ReaderManager::Instance()->StartDataReader( - reader_id, reader_block, &scope, dev_place, indices_var_name, + reader_id, reader_block, &scope, platform::CPUPlace(), indices_var_name, output_var_names, output_queues, batch_size, num_samples, shuffle, drop_last, rank, world_size); } diff --git a/paddle/fluid/operators/data/data_reader_op.h b/paddle/fluid/operators/data/data_reader_op.h index d59ab369abfd8b..74c48c0a8b1d30 100644 --- a/paddle/fluid/operators/data/data_reader_op.h +++ b/paddle/fluid/operators/data/data_reader_op.h @@ -123,7 +123,6 @@ class DataReader { while (running_.load()) { std::vector indices; sampler_.GetNextIndices(&indices); - LOG(ERROR) << "DataReaderOp thread got indices " << indices.size(); // shutdown reader if indices drained if (indices.size() == 0) { for(auto& queue: output_queues_) { @@ -140,8 +139,7 @@ class DataReader { try { executor.Run(*reader_block_->Program(), &scope_, static_cast(reader_block_->ID()), - false, true, std::vector(), - false, true); + false, true, {}, false, true); } catch (...) 
{ break; } @@ -167,7 +165,6 @@ class DataReader { output_queues_[i]->Push(t_arr); } } - LOG(ERROR) << "ReaderThread output"; } scope->DeleteScope(&scope_); }); diff --git a/paddle/fluid/operators/data/file_label_loader_op.h b/paddle/fluid/operators/data/file_label_loader_op.h index 217df6504e6afe..b8db6be900fb70 100644 --- a/paddle/fluid/operators/data/file_label_loader_op.h +++ b/paddle/fluid/operators/data/file_label_loader_op.h @@ -221,7 +221,7 @@ class FileLabelLoaderCPUKernel: public framework::OpKernel { LOG(ERROR) << "FileLabelLoaderOp RunImpl start"; auto* indices = ctx.Input("Indices"); auto* image_arr = ctx.Output("Image"); - auto* label_arr = ctx.Output("Label"); + auto* label_tensor = ctx.Output("Label"); auto files = ctx.Attr>("files"); auto labels = ctx.Attr>("labels"); @@ -229,9 +229,11 @@ class FileLabelLoaderCPUKernel: public framework::OpKernel { auto batch_size = indices->dims()[0]; const int64_t* indices_data = indices->data(); + image_arr->clear(); image_arr->reserve(batch_size); - std::vector label_vec; - label_vec.reserve(batch_size); + label_tensor->Resize( + framework::make_ddim({static_cast(batch_size)})); + auto* label_data = label_tensor->mutable_data(platform::CPUPlace()); for (int64_t i = 0; i < batch_size; i++) { int64_t index = indices_data[i]; std::ifstream input(files[index].c_str(), @@ -249,17 +251,9 @@ class FileLabelLoaderCPUKernel: public framework::OpKernel { input.read(reinterpret_cast(data), file_size); image_arr->emplace_back(image); - label_vec.emplace_back(labels[index]); + label_data[i] = labels[index]; } - framework::LoDTensor label_tensor; - label_tensor.Resize( - framework::make_ddim({static_cast(label_vec.size())})); - auto* label_data = label_tensor.mutable_data(platform::CPUPlace()); - for (int i = 0; i < batch_size; i++) label_data[i] = label_vec[i]; - - label_arr->reserve(1); - label_arr->emplace_back(label_tensor); LOG(ERROR) << "FileLabelLoaderOp RunImpl finish"; // auto out_queue = out->Get().GetQueue(); diff 
--git a/paddle/fluid/operators/data/shutdown.h b/paddle/fluid/operators/data/shutdown.h index 79226d8bd6e4c1..222b4c31bb50b3 100644 --- a/paddle/fluid/operators/data/shutdown.h +++ b/paddle/fluid/operators/data/shutdown.h @@ -13,26 +13,22 @@ // limitations under the License. #pragma once -#include "paddle/fluid/operators/data/file_label_loader_op.h" +#include "paddle/fluid/operators/data/data_reader_op.h" #include "paddle/fluid/operators/data/nvjpeg_decoder.h" #include "paddle/fluid/operators/data/map_runner.h" namespace paddle { namespace operators { - -// extern FileDataReaderWrapper reader_wrapper; - namespace data { extern NvjpegDecoderThreadPool* decode_pool; void ShutDownDataLoader() { LOG(ERROR) << "ShutDownDataLoader enter"; - // // step 1: shutdown reader - // // reader_wrapper.ShutDown(); - // ReaderManager::Instance()->ShutDown(); - // LOG(ERROR) << "ShutDownDataLoader reader_wrapper shutdown finish"; + // step 1: shutdown reader + ReaderManager::Instance()->ShutDown(); + LOG(ERROR) << "ShutDownDataLoader reader_wrapper shutdown finish"; // step 2: shutdown decoder if (decode_pool) decode_pool->ShutDown(); diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index 0c62c69382c60c..5b025be5394808 100644 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -901,7 +901,7 @@ def file_label_loader(data_root, indices, name=None): label = helper.create_variable( name=unique_name.generate("file_label_loader"), - type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, + type=core.VarDesc.VarType.LOD_TENSOR, dtype='int') helper.append_op( From fd14988941981f9546fe17b80734f0888c375fd0 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Mon, 24 Jan 2022 09:00:41 +0000 Subject: [PATCH 50/95] support dygraph running --- paddle/fluid/pybind/op_function_generator.cc | 3 + python/paddle/fluid/dataloader/ops.py | 7 +- python/paddle/vision/__init__.py | 1 + python/paddle/vision/ops.py | 138 ++---------- python/paddle/vision/reader.py | 211 
+++++++++++++++++++ 5 files changed, 234 insertions(+), 126 deletions(-) create mode 100644 python/paddle/vision/reader.py diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index 260541931aa9fe..186afa85bbff15 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -189,6 +189,9 @@ std::map> op_passing_outs_map = { {"run_program", {"Out", "DOut", "OutScope"}}, {"dataloader", {"Out"}}, {"map", {"Out"}}, + {"file_label_loader", {"Image"}}, + {"batch_decode", {"Out"}}, + {"batch_decode_random_crop", {"Out"}}, {"clear_float_status", {"FloatStatusOut"}}, {"get_float_status", {"FloatStatusOut"}}, }; diff --git a/python/paddle/fluid/dataloader/ops.py b/python/paddle/fluid/dataloader/ops.py index f47e76599caca1..092c8db974a5ac 100755 --- a/python/paddle/fluid/dataloader/ops.py +++ b/python/paddle/fluid/dataloader/ops.py @@ -61,8 +61,10 @@ def _generate_stream_id(): def map(map_func, inputs): - assert not in_dygraph_mode(), \ - "paddle.io.map can only be used in static mode" + inputs = _to_list(inputs) + if in_dygraph_mode(): + return map_func(*inputs) + helper = LayerHelper("map", **locals()) # build map block @@ -71,7 +73,6 @@ def map(map_func, inputs): program_id = _hash_with_id(main_program, map_func) map_block = main_program.current_block() - inputs = _to_list(inputs) program_inputs = [ map_block.create_var( name=unique_name.generate("map_sub"), diff --git a/python/paddle/vision/__init__.py b/python/paddle/vision/__init__.py index 54f293d7f57d10..02a8a2af031ad3 100644 --- a/python/paddle/vision/__init__.py +++ b/python/paddle/vision/__init__.py @@ -17,6 +17,7 @@ from . import transforms # noqa: F401 from . import datasets # noqa: F401 from . import ops # noqa: F401 +from . 
import reader # noqa: F401 from .image import set_image_backend # noqa: F401 from .image import get_image_backend # noqa: F401 from .image import image_load # noqa: F401 diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index 5b025be5394808..aa5011baaf1d8b 100644 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -867,127 +867,6 @@ def read_file(filename, name=None): return out -def file_label_loader(data_root, indices, name=None): - """ - Reads a batch of data, outputs the bytes contents of a file - as a uint8 Tensor with one dimension. - - Args: - data_root (str): root directory of data - indices (list of int): batch indices of samples - name (str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name`. - """ - from paddle.vision.datasets import DatasetFolder - data_folder = DatasetFolder(data_root) - samples = [s[0] for s in data_folder.samples] - targets = [s[1] for s in data_folder.samples] - - if in_dygraph_mode(): - return _C_ops.file_label_loader(indices, 'files', samples, 'labels', targets) - - inputs = {"Indices": indices} - attrs = { - 'files': samples, - 'labels': targets, - } - - helper = LayerHelper("file_label_loader", **locals()) - image = helper.create_variable( - name=unique_name.generate("file_label_loader"), - type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, - dtype='uint8') - - label = helper.create_variable( - name=unique_name.generate("file_label_loader"), - type=core.VarDesc.VarType.LOD_TENSOR, - dtype='int') - - helper.append_op( - type="file_label_loader", - inputs=inputs, - attrs=attrs, - outputs={"Image": image, - "Label": label}) - - return image, label - - -def file_label_reader(file_root, - batch_size=1, - shuffle=False, - drop_last=False): - """ - Reads and outputs the bytes contents of a file as a uint8 Tensor - with one dimension. - - Args: - filename (str): Path of the file to be read. 
- name (str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name`. - - Returns: - A uint8 tensor. - - Examples: - .. code-block:: python - - import cv2 - import paddle - - image = paddle.vision.ops.file_label_reader('/workspace/datasets/ILSVRC2012/val/', 2) - - """ - from paddle.vision.datasets import DatasetFolder - data_folder = DatasetFolder(file_root) - samples = [s[0] for s in data_folder.samples] - targets = [s[1] for s in data_folder.samples] - - if in_dygraph_mode(): - return _C_ops.file_label_loader(list(arange(batch_size)), "files", - samples, "labels", labels) - - def _reader(indices): - return file_label_loader(file_root, indices) - - return paddle.io.data_reader(_reader, - batch_size=batch_size, - num_samples=len(samples), - shuffle=shuffle, - drop_last=drop_last) - # inputs = dict() - # attrs = { - # 'root_dir': file_root, - # 'batch_size': batch_size, - # 'files': samples, - # 'labels': targets, - # 'reader_id': unq_reader_id, - # } - # - # helper = LayerHelper("file_label_reader", **locals()) - # out = helper.create_variable( - # name=unique_name.generate("file_label_reader"), - # type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, - # dtype='uint8') - # - # label = helper.create_variable( - # name=unique_name.generate("file_label_reader"), - # type=core.VarDesc.VarType.LOD_TENSOR, - # dtype='int') - # - # helper.append_op( - # type="file_label_reader", - # inputs=inputs, - # attrs=attrs, - # outputs={"Out": out, - # "Label": label - # }) - - return out, label - - def image_decode(x, mode='unchanged', num_threads=2, name=None): """ Decodes a JPEG image into a 3 dimensional RGB Tensor or 1 dimensional Gray Tensor. 
@@ -1025,8 +904,11 @@ def image_decode(x, mode='unchanged', num_threads=2, name=None): local_rank = paddle.distributed.get_rank() if in_dygraph_mode(): + out = core.VarBase(core.VarDesc.VarType.UINT8, [], + unique_name.generate("image_decode"), + core.VarDesc.VarType.LOD_TENSOR_ARRAY, False) return _C_ops.batch_decode( - x, "mode", mode, "num_threads", num_threads, + x, out, "mode", mode, "num_threads", num_threads, "local_rank", local_rank) inputs = {'X': x} @@ -1093,8 +975,11 @@ def image_decode_random_crop(x, """ local_rank = paddle.distributed.get_rank() if in_dygraph_mode(): + out = core.VarBase(core.VarDesc.VarType.UINT8, [], + unique_name.generate("image_decode_random_crop"), + core.VarDesc.VarType.LOD_TENSOR_ARRAY, False) return _C_ops.batch_decode_random_crop( - x, "mode", mode, "num_threads", num_threads, + x, out, "mode", mode, "num_threads", num_threads, "aspect_ratio_min", aspect_ratio_min, "aspect_ratio_max", aspect_ratio_max, "area_min", area_min, "area_max", area_max, @@ -1123,6 +1008,13 @@ def image_decode_random_crop(x, def random_flip(x, batch_size, prob=0.5, name=None): + if in_dygraph_mode(): + p = np.random.uniform(0., 1., [batch_size]) + for i in range(batch_size): + if p[i] < prob: + x[i] = paddle.flip(x[i], -1) + return x + if prob < 0. or prob > 1.: raise ValueError("prob should in (0, 1) in random_flip") diff --git a/python/paddle/vision/reader.py b/python/paddle/vision/reader.py new file mode 100644 index 00000000000000..928622395256be --- /dev/null +++ b/python/paddle/vision/reader.py @@ -0,0 +1,211 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from ..fluid.layer_helper import LayerHelper, unique_name +from ..fluid import core, layers +from ..fluid.layers import nn, utils + +import paddle +from paddle.common_ops_import import * + +__all__ = [ #noqa + 'file_label_loader', + 'file_label_reader', +] + + +class _Sampler(object): + def __init__(self, batch_size, num_samples, + shuffle=False, drop_last=False): + self.batch_size = batch_size + self.drop_last = drop_last + self.num_samples = num_samples + self.start_idx = 0 + + self.sample_ids = np.arange(num_samples) + if shuffle: + np.random.shuffle(self.sample_ids) + + def __next__(self): + if self.start_idx >= self.num_samples: + self.reset() + return self.__next__() + + batch_len = min(self.batch_size, self.num_samples - self.start_idx) + indices = self.sample_ids[self.start_idx:self.start_idx + batch_len] + self.start_idx += batch_len + + if self.drop_last and len(indices) < self.batch_size: + self.reset() + return self.__next__() + + return indices + + def reset(self): + self.start_idx = 0 + if self.shuffle: + np.random.shuffle(self.sample_ids) + + +class _SamplerManager(object): + def __init__(self): + self.samplers = {} + + def get(self, sample_id, batch_size, num_samples, + shuffle=False, drop_last=False): + if sample_id in self.samplers: + return self.samplers[sample_id] + + sampler = _Sampler(batch_size, num_samples, + shuffle, drop_last) + self.samplers[sample_id] = sampler + return sampler + + +_sampler_manager = _SamplerManager() + + +def file_label_loader(data_root, indices, name=None): + """ + Reads a batch of data, 
outputs the bytes contents of a file + as a uint8 Tensor with one dimension. + + Args: + data_root (str): root directory of data + indices (list of int): batch indices of samples + name (str, optional): The default value is None. Normally there is no + need for user to set this property. For more information, please + refer to :ref:`api_guide_Name`. + """ + from paddle.vision.datasets import DatasetFolder + data_folder = DatasetFolder(data_root) + samples = [s[0] for s in data_folder.samples] + targets = [s[1] for s in data_folder.samples] + + if in_dygraph_mode(): + image = core.VarBase(core.VarDesc.VarType.UINT8, [], + unique_name.generate("file_label_loader"), + core.VarDesc.VarType.LOD_TENSOR_ARRAY, False) + return _C_ops.file_label_loader(indices, image, 'files', + samples, 'labels', targets) + + inputs = {"Indices": indices} + attrs = { + 'files': samples, + 'labels': targets, + } + + helper = LayerHelper("file_label_loader", **locals()) + image = helper.create_variable( + name=unique_name.generate("file_label_loader"), + type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, + dtype='uint8') + + label = helper.create_variable( + name=unique_name.generate("file_label_loader"), + type=core.VarDesc.VarType.LOD_TENSOR, + dtype='int') + + helper.append_op( + type="file_label_loader", + inputs=inputs, + attrs=attrs, + outputs={"Image": image, + "Label": label}) + + return image, label + + +def file_label_reader(file_root, + batch_size=1, + shuffle=False, + drop_last=False): + """ + Reads and outputs the bytes contents of a file as a uint8 Tensor + with one dimension. + + Args: + filename (str): Path of the file to be read. + name (str, optional): The default value is None. Normally there is no + need for user to set this property. For more information, please + refer to :ref:`api_guide_Name`. + + Returns: + A uint8 tensor. + + Examples: + .. 
code-block:: python + + import cv2 + import paddle + + image = paddle.vision.ops.file_label_reader('/workspace/datasets/ILSVRC2012/val/', 2) + + """ + from paddle.vision.datasets import DatasetFolder + data_folder = DatasetFolder(file_root) + samples = [s[0] for s in data_folder.samples] + targets = [s[1] for s in data_folder.samples] + + if in_dygraph_mode(): + sample_id = utils._hash_with_id(file_root, batch_size, + shuffle, drop_last) + print("sample_id", sample_id) + sampler = _sampler_manager.get(sample_id, + batch_size=batch_size, + num_samples=len(samples), + shuffle=shuffle, + drop_last=drop_last) + indices = paddle.to_tensor(next(sampler), dtype='int64') + return file_label_loader(file_root, indices) + + def _reader(indices): + return file_label_loader(file_root, indices) + + return paddle.io.data_reader(_reader, + batch_size=batch_size, + num_samples=len(samples), + shuffle=shuffle, + drop_last=drop_last) + # inputs = dict() + # attrs = { + # 'root_dir': file_root, + # 'batch_size': batch_size, + # 'files': samples, + # 'labels': targets, + # 'reader_id': unq_reader_id, + # } + # + # helper = LayerHelper("file_label_reader", **locals()) + # out = helper.create_variable( + # name=unique_name.generate("file_label_reader"), + # type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, + # dtype='uint8') + # + # label = helper.create_variable( + # name=unique_name.generate("file_label_reader"), + # type=core.VarDesc.VarType.LOD_TENSOR, + # dtype='int') + # + # helper.append_op( + # type="file_label_reader", + # inputs=inputs, + # attrs=attrs, + # outputs={"Out": out, + # "Label": label + # }) + + return out, label + From f0d9d954ebad9e9f7d9566aa9f5b57c21a1c036a Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Mon, 24 Jan 2022 13:08:33 +0000 Subject: [PATCH 51/95] fix import _C_ops --- python/paddle/vision/reader.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/paddle/vision/reader.py b/python/paddle/vision/reader.py index 928622395256be..3084b041df9909 100644 
--- a/python/paddle/vision/reader.py +++ b/python/paddle/vision/reader.py @@ -19,6 +19,7 @@ import paddle from paddle.common_ops_import import * +from paddle import _C_ops __all__ = [ #noqa 'file_label_loader', From 9adaeab5fcc9803ef287a3051a2bb87d49881913 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 25 Jan 2022 06:23:19 +0000 Subject: [PATCH 52/95] refine api --- python/paddle/fluid/dataloader/ops.py | 7 +------ python/paddle/fluid/dataloader/pipeline.py | 5 +++-- python/paddle/fluid/reader.py | 2 +- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/python/paddle/fluid/dataloader/ops.py b/python/paddle/fluid/dataloader/ops.py index 092c8db974a5ac..307ea41b4a664f 100755 --- a/python/paddle/fluid/dataloader/ops.py +++ b/python/paddle/fluid/dataloader/ops.py @@ -89,22 +89,17 @@ def map(map_func, inputs): name=unique_name.generate("map"), type=outp.desc.type(), persistable=True) for outp in program_outputs] - stream_id = _generate_stream_id() attrs = { "map_block": map_block, "program_id": program_id, "input_var_names": input_var_names, "output_var_names": output_var_names } - print("stream id: ", stream_id, "attr: ", attrs) - # stream_id = _generate_stream_id() + stream_id = _generate_stream_id() for idx in range(map_block.desc.op_size()): map_block.desc.op(idx)._set_attr('stream_id', stream_id) - import sys - sys.stdout.flush() - helper.append_op( type="map", inputs={"In": inputs}, diff --git a/python/paddle/fluid/dataloader/pipeline.py b/python/paddle/fluid/dataloader/pipeline.py index 89eb8a773e813d..301f4d074836d4 100755 --- a/python/paddle/fluid/dataloader/pipeline.py +++ b/python/paddle/fluid/dataloader/pipeline.py @@ -66,6 +66,7 @@ def __exit__(self, exception_type, exception_value, traceback): local_rank = paddle.distributed.get_rank() paddle.disable_static("gpu:" + str(local_rank)) + print("main_program", self._main_program.block(0)) def set_outputs(self, outputs): if isinstance(outputs, Sequence): @@ -128,5 +129,5 @@ def __next__(self): 
def next(self): return self.__next__() - # def __del__(self): - # core._shutdown_dataloader() + def __del__(self): + core._shutdown_dataloader() diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py index 7fff15cc9807c6..f9de474cfa6679 100644 --- a/python/paddle/fluid/reader.py +++ b/python/paddle/fluid/reader.py @@ -437,7 +437,7 @@ def __call__(self): return self.__iter__() @staticmethod - def build_pipeline(func, *args, **kwargs): + def from_pipeline(func, *args, **kwargs): with Pipeline() as pipeline: outputs = func(*args, **kwargs) pipeline.set_outputs(outputs) From 6183815792317fe7a57032e42fa076bc62ea3927 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 25 Jan 2022 12:13:15 +0000 Subject: [PATCH 53/95] fix drop_last=False hang --- paddle/fluid/operators/data/map_runner.cc | 2 +- paddle/fluid/operators/data/nvjpeg_decoder.cc | 2 +- paddle/fluid/operators/split_lod_tensor_op.cc | 42 ++++++++++--------- python/paddle/fluid/dataloader/pipeline.py | 15 ++++++- python/paddle/vision/ops.py | 12 +++--- 5 files changed, 44 insertions(+), 29 deletions(-) diff --git a/paddle/fluid/operators/data/map_runner.cc b/paddle/fluid/operators/data/map_runner.cc index 2c6cc0ab1ebd48..94cec0ca9f7f42 100644 --- a/paddle/fluid/operators/data/map_runner.cc +++ b/paddle/fluid/operators/data/map_runner.cc @@ -59,7 +59,7 @@ bool MapRunner::ShareInputsIntoScope(Scope* scope) { // If input queue closed, namely EOE(end of epoch) from // dataset reader to here, read failed auto queue = input_queues_[i]; - if (queue->IsClosed()) return false; + // if (queue->IsClosed()) return false; // read LoDTensorArray from queue bool success = true; diff --git a/paddle/fluid/operators/data/nvjpeg_decoder.cc b/paddle/fluid/operators/data/nvjpeg_decoder.cc index 5255f5bc5a01ad..ce308c960cb55d 100644 --- a/paddle/fluid/operators/data/nvjpeg_decoder.cc +++ b/paddle/fluid/operators/data/nvjpeg_decoder.cc @@ -41,7 +41,7 @@ NvjpegDecoder::NvjpegDecoder(std::string mode, int dev_id) // 
create decode params, decoder and state PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecodeParamsCreate(handle_, &decode_params_)); - PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecoderCreate(handle_, NVJPEG_BACKEND_DEFAULT, &decoder_)); + PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecoderCreate(handle_, NVJPEG_BACKEND_HYBRID, &decoder_)); PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecoderStateCreate(handle_, decoder_, &state_)); // create device & pinned buffer diff --git a/paddle/fluid/operators/split_lod_tensor_op.cc b/paddle/fluid/operators/split_lod_tensor_op.cc index fe646b2830b66f..8a8558ad18364c 100644 --- a/paddle/fluid/operators/split_lod_tensor_op.cc +++ b/paddle/fluid/operators/split_lod_tensor_op.cc @@ -166,26 +166,28 @@ class SplitLoDTensorInferShape : public framework::InferShapeBase { OP_INOUT_CHECK(context->HasOutput("OutFalse"), "Output", "OutFalse", "SplitLoDTensor"); - auto mask_dim = context->GetInputDim("Mask"); - PADDLE_ENFORCE_EQ( - mask_dim.size(), 2, - platform::errors::InvalidArgument( - "If you are using IfElse OP:" - "\n\nie = fluid.layers.IfElse(cond=cond)\nwith " - "ie.true_block():\n out_1 = ie.input(x)\n\n" - "Please ensure that the cond should be a 2-D tensor and " - "the second dim size of cond should be 1. " - "But now the cond's shape is [", - *mask_dim.Get(), "].\n")); - PADDLE_ENFORCE_EQ(mask_dim[1], 1, - platform::errors::InvalidArgument( - "If you are using IfElse OP:" - "\n\nie = fluid.layers.IfElse(cond=cond)\nwith " - "ie.true_block():\n out_1 = ie.input(x)\n\n" - "Please ensure that the cond should be a 2-D tensor " - "and the second dim size of cond should be 1. 
" - "But now the cond's shape is [", - *mask_dim.Get(), "].\n")); + if (context->IsRuntime()) { + auto mask_dim = context->GetInputDim("Mask"); + PADDLE_ENFORCE_EQ( + mask_dim.size(), 2, + platform::errors::InvalidArgument( + "If you are using IfElse OP:" + "\n\nie = fluid.layers.IfElse(cond=cond)\nwith " + "ie.true_block():\n out_1 = ie.input(x)\n\n" + "Please ensure that the cond should be a 2-D tensor and " + "the second dim size of cond should be 1. " + "But now the cond's shape is [", + *mask_dim.Get(), "].\n")); + PADDLE_ENFORCE_EQ(mask_dim[1], 1, + platform::errors::InvalidArgument( + "If you are using IfElse OP:" + "\n\nie = fluid.layers.IfElse(cond=cond)\nwith " + "ie.true_block():\n out_1 = ie.input(x)\n\n" + "Please ensure that the cond should be a 2-D tensor " + "and the second dim size of cond should be 1. " + "But now the cond's shape is [", + *mask_dim.Get(), "].\n")); + } context->SetOutputDim("OutTrue", context->GetInputDim("X")); context->SetOutputDim("OutFalse", context->GetInputDim("X")); diff --git a/python/paddle/fluid/dataloader/pipeline.py b/python/paddle/fluid/dataloader/pipeline.py index 301f4d074836d4..945290e4dd2d45 100755 --- a/python/paddle/fluid/dataloader/pipeline.py +++ b/python/paddle/fluid/dataloader/pipeline.py @@ -30,6 +30,9 @@ CleanupFuncRegistrar.register(core._shutdown_dataloader) +AVAILABLE_OP_TYPES = ['data_reader', 'map'] + + class Pipeline: """ Data pipeline @@ -66,7 +69,17 @@ def __exit__(self, exception_type, exception_value, traceback): local_rank = paddle.distributed.get_rank() paddle.disable_static("gpu:" + str(local_rank)) - print("main_program", self._main_program.block(0)) + + self._check_op_type() + + def _check_op_type(self): + for op in self._main_program.block(0).ops: + if op.type not in ['data_reader', 'map']: + raise RuntimeError( + "pipeline given to DataLoader.from_pipeline should be " + "composed of reader OPs and map OP, other OPs(e.g. 
" + "decoder OPs or Paddle OPs) should be run under " + "paddle.io.map") def set_outputs(self, outputs): if isinstance(outputs, Sequence): diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index aa5011baaf1d8b..f4c356151df247 100644 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -1007,18 +1007,18 @@ def image_decode_random_crop(x, return out -def random_flip(x, batch_size, prob=0.5, name=None): +def random_flip(x, prob=0.5, name=None): + if prob < 0. or prob > 1.: + raise ValueError("prob should in (0, 1) in random_flip") + if in_dygraph_mode(): - p = np.random.uniform(0., 1., [batch_size]) + p = np.random.uniform(0., 1., x.shape[0:1]) for i in range(batch_size): if p[i] < prob: x[i] = paddle.flip(x[i], -1) return x - if prob < 0. or prob > 1.: - raise ValueError("prob should in (0, 1) in random_flip") - - p = paddle.uniform([batch_size, 1], min=0., max=1.) + p = paddle.uniform([layers.shape(x)[0], 1], min=0., max=1.) ie = layers.IfElse(p < prob) with ie.true_block(): out = ie.input(x) From 8668ac82ec590cd23f187b4d092e5454eef0dbdc Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Wed, 26 Jan 2022 08:08:13 +0000 Subject: [PATCH 54/95] support program shutdown --- .../fluid/framework/ir/data_io_queue_pass.cc | 6 +- .../fluid/operators/data/batch_decode_op.cc | 6 +- .../fluid/operators/data/batch_decode_op.cu | 37 ++++-------- .../data/batch_decode_random_crop_op.cc | 4 ++ .../data/batch_decode_random_crop_op.cu | 32 ++-------- paddle/fluid/operators/data/data_reader_op.h | 9 ++- paddle/fluid/operators/data/map_op.h | 2 +- paddle/fluid/operators/data/nvjpeg_decoder.cc | 4 ++ paddle/fluid/operators/data/nvjpeg_decoder.h | 58 +++++++++++++++++++ paddle/fluid/operators/data/shutdown.h | 41 +++++++++++-- paddle/fluid/pybind/pybind.cc | 11 +++- python/paddle/fluid/core.py | 10 +++- python/paddle/fluid/dataloader/ops.py | 3 +- python/paddle/fluid/dataloader/pipeline.py | 22 ++++++- python/paddle/vision/ops.py | 11 ++-- 
python/paddle/vision/reader.py | 1 - 16 files changed, 177 insertions(+), 80 deletions(-) diff --git a/paddle/fluid/framework/ir/data_io_queue_pass.cc b/paddle/fluid/framework/ir/data_io_queue_pass.cc index 8d9769da9ec681..d283e1edef1539 100644 --- a/paddle/fluid/framework/ir/data_io_queue_pass.cc +++ b/paddle/fluid/framework/ir/data_io_queue_pass.cc @@ -61,8 +61,7 @@ static void ProcessOutputQueueHolderOp(ir::Graph* graph) { if (n->IsVar() && n->Var()) { auto *var = n->Var(); if (var_names.find(var->Name()) != var_names.end()) { - // VLOG(3) << "Change output variable type of " << var->Name() << " to queue holder"; - LOG(ERROR) << "Change output variable type of " << var->Name() << " to queue holder"; + VLOG(3) << "Change output variable type of " << var->Name() << " to queue holder"; var->SetType(framework::proto::VarType::LOD_TENSOR_BLOCKING_QUEUE); var->SetPersistable(true); } @@ -88,8 +87,7 @@ static void ProcessInputArrayOp(ir::Graph* graph) { if (n->IsVar() && n->Var()) { auto *var = n->Var(); if (var_names.find(var->Name()) != var_names.end()) { - // VLOG(3) << "Change output variable type of " << var->Name() << " to queue holder"; - LOG(ERROR) << "Change input variable type of " << var->Name() << " to array"; + VLOG(3) << "Change output variable type of " << var->Name() << " to queue holder"; var->SetType(framework::proto::VarType::LOD_TENSOR_ARRAY); } } diff --git a/paddle/fluid/operators/data/batch_decode_op.cc b/paddle/fluid/operators/data/batch_decode_op.cc index d7c39f0aaf1c4d..ebe7908ac6e0f1 100644 --- a/paddle/fluid/operators/data/batch_decode_op.cc +++ b/paddle/fluid/operators/data/batch_decode_op.cc @@ -76,8 +76,12 @@ and 255. 
",\"gray\" , \"rgb\" .") .SetDefault("unchanged"); AddAttr("local_rank", - "(int64_t)" + "(int)" "The index of the op to start execution"); + AddAttr("program_id", + "(int64_t)" + "The unique hash id used as cache key for " + "decode thread pool"); } }; diff --git a/paddle/fluid/operators/data/batch_decode_op.cu b/paddle/fluid/operators/data/batch_decode_op.cu index c8657d3a150838..c77b81f4ecd927 100644 --- a/paddle/fluid/operators/data/batch_decode_op.cu +++ b/paddle/fluid/operators/data/batch_decode_op.cu @@ -23,7 +23,7 @@ namespace data { using LoDTensorBlockingQueueHolder = operators::reader::LoDTensorBlockingQueueHolder; -static NvjpegDecoderThreadPool* decode_pool = nullptr; +// static NvjpegDecoderThreadPool* decode_pool = nullptr; template class GPUBatchDecodeKernel : public framework::OpKernel { @@ -33,12 +33,16 @@ class GPUBatchDecodeKernel : public framework::OpKernel { LOG(ERROR) << "GPUBatchDecodeJpegKernel Compute start, num_threads: " << num_threads; auto mode = ctx.Attr("mode"); auto local_rank = ctx.Attr("local_rank"); + auto program_id = ctx.Attr("program_id"); - // multi-phrase decode thread pool - if (!decode_pool) { - LOG(ERROR) << "GPUBatchDecodeJpegKernel decode_pool init"; - decode_pool = new NvjpegDecoderThreadPool(num_threads, mode, local_rank); - } + // // multi-phrase decode thread pool + // if (!decode_pool) { + // LOG(ERROR) << "GPUBatchDecodeJpegKernel decode_pool init"; + // decode_pool = new NvjpegDecoderThreadPool(num_threads, mode, local_rank); + // } + auto* decode_pool = + DecoderThreadPoolManager::Instance()->GetDecoderThreadPool( + program_id, num_threads, mode, local_rank); const framework::LoDTensorArray* inputs = ctx.Input("X"); @@ -63,27 +67,6 @@ class GPUBatchDecodeKernel : public framework::OpKernel { } decode_pool->RunAll(true); - // out_queue->Push(out_array); - - // // multi-phrase decode single thread - // if (!nvjpeg_decoder) { - // nvjpeg_decoder = new NvjpegDecoder(mode); - // } - // - // const 
framework::LoDTensorArray* inputs = - // ctx.Input("X"); - // - // auto* out = ctx.OutputVar("Out"); - // auto& out_array = *out->GetMutable(); - // out_array.resize(inputs->size()); - // - // for (size_t i = 0; i < inputs->size(); i++) { - // const framework::LoDTensor x = inputs->at(i); - // auto* x_data = x.data(); - // - // nvjpeg_decoder->Run(x_data, static_cast(x.numel()), - // &out_array[i], &ctx); - // } LOG(ERROR) << "GPUBatchDecodeJpegKernel Compute finish"; } diff --git a/paddle/fluid/operators/data/batch_decode_random_crop_op.cc b/paddle/fluid/operators/data/batch_decode_random_crop_op.cc index 543b97ed91ad15..2ca56063936d14 100644 --- a/paddle/fluid/operators/data/batch_decode_random_crop_op.cc +++ b/paddle/fluid/operators/data/batch_decode_random_crop_op.cc @@ -137,6 +137,10 @@ and 255. AddAttr("area_min", "").SetDefault(0.08); AddAttr("area_max", "").SetDefault(1.); AddAttr("num_attempts", "").SetDefault(10); + AddAttr("program_id", + "(int64_t)" + "The unique hash id used as cache key for " + "decode thread pool"); } }; diff --git a/paddle/fluid/operators/data/batch_decode_random_crop_op.cu b/paddle/fluid/operators/data/batch_decode_random_crop_op.cu index 478ef5d676e966..728a861861bc31 100644 --- a/paddle/fluid/operators/data/batch_decode_random_crop_op.cu +++ b/paddle/fluid/operators/data/batch_decode_random_crop_op.cu @@ -33,12 +33,12 @@ class GPUBatchDecodeRandomCropKernel : public framework::OpKernel { int num_threads = ctx.Attr("num_threads"); auto mode = ctx.Attr("mode"); auto local_rank = ctx.Attr("local_rank"); + auto program_id = ctx.Attr("program_id"); + // multi-phrase decode thread pool - if (!decode_pool) { - LOG(ERROR) << "GPUBatchDecodeJpegKernel decode_pool init"; - decode_pool = new NvjpegDecoderThreadPool(num_threads, mode, local_rank); - // rand_seq = new std::seed_seq(static_cast(time(0))); - } + auto* decode_pool = + DecoderThreadPoolManager::Instance()->GetDecoderThreadPool( + program_id, num_threads, mode, local_rank); const 
framework::LoDTensorArray* inputs = ctx.Input("X"); @@ -74,33 +74,11 @@ class GPUBatchDecodeRandomCropKernel : public framework::OpKernel { .roi_generator = new RandomROIGenerator( aspect_ratio_range, area_range, rands[i]), .place = dev - // .place = ctx.GetPlace() }; decode_pool->AddTask(std::make_shared(task)); } decode_pool->RunAll(true); - // out_queue->Push(out_array); - - // // multi-phrase decode single thread - // if (!nvjpeg_decoder) { - // nvjpeg_decoder = new NvjpegDecoder(mode); - // } - // - // const framework::LoDTensorArray* inputs = - // ctx.Input("X"); - // - // auto* out = ctx.OutputVar("Out"); - // auto& out_array = *out->GetMutable(); - // out_array.resize(inputs->size()); - // - // for (size_t i = 0; i < inputs->size(); i++) { - // const framework::LoDTensor x = inputs->at(i); - // auto* x_data = x.data(); - // - // nvjpeg_decoder->Run(x_data, static_cast(x.numel()), - // &out_array[i], &ctx); - // } LOG(ERROR) << "GPUBatchDecodeJpegKernel Compute finish"; } diff --git a/paddle/fluid/operators/data/data_reader_op.h b/paddle/fluid/operators/data/data_reader_op.h index 74c48c0a8b1d30..aaa29b79a00216 100644 --- a/paddle/fluid/operators/data/data_reader_op.h +++ b/paddle/fluid/operators/data/data_reader_op.h @@ -254,6 +254,13 @@ class ReaderManager { } } + void ShutDownReader(const int64_t reader_id) { + auto iter = id_to_reader_.find(reader_id); + if (iter != id_to_reader_.end()) { + iter->second->ShutDown(); + id_to_reader_.erase(reader_id); + } + } void ShutDown() { auto iter = id_to_reader_.begin(); while (iter != id_to_reader_.end()){ @@ -284,7 +291,7 @@ static void CheckAndInitOutputQueue(const std::vector& vars, int capa if (queue == nullptr) { auto* holder = var->template GetMutable(); holder->InitOnce(capacity); - LOG(ERROR) << "DataLoaderOpKernel init queue" << holder->GetQueue(); + VLOG(1) << "DataLoaderOpKernel init queue" << holder->GetQueue(); } } else { VLOG(1) << "Initialize Output LoDTensorBlockingQueue capacity " << capacity; diff 
--git a/paddle/fluid/operators/data/map_op.h b/paddle/fluid/operators/data/map_op.h index 2431b9e6a0932c..9ca34671f882cc 100644 --- a/paddle/fluid/operators/data/map_op.h +++ b/paddle/fluid/operators/data/map_op.h @@ -46,7 +46,7 @@ static void CheckAndInitOutputQueue(const std::vector& vars, int capa if (queue == nullptr) { auto* holder = var->template GetMutable(); holder->InitOnce(capacity); - LOG(ERROR) << "MapOpKernel init queue" << holder->GetQueue(); + VLOG(1) << "MapOpKernel init queue" << holder->GetQueue(); } } else { VLOG(1) << "Initialize Output LoDTensorBlockingQueue capacity " << capacity; diff --git a/paddle/fluid/operators/data/nvjpeg_decoder.cc b/paddle/fluid/operators/data/nvjpeg_decoder.cc index ce308c960cb55d..e6f412fb9794ae 100644 --- a/paddle/fluid/operators/data/nvjpeg_decoder.cc +++ b/paddle/fluid/operators/data/nvjpeg_decoder.cc @@ -255,6 +255,10 @@ void NvjpegDecoderThreadPool::ThreadLoop(const int thread_idx) { } } +// initialization static variables out of MapRunnerManager +DecoderThreadPoolManager* DecoderThreadPoolManager::pm_instance_ptr_ = nullptr; +std::mutex DecoderThreadPoolManager::m_; + } // namespace data } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/data/nvjpeg_decoder.h b/paddle/fluid/operators/data/nvjpeg_decoder.h index 73a0beabb54f13..b84730bb1fd028 100644 --- a/paddle/fluid/operators/data/nvjpeg_decoder.h +++ b/paddle/fluid/operators/data/nvjpeg_decoder.h @@ -117,6 +117,64 @@ class NvjpegDecoderThreadPool { int outstand_tasks_; }; +class DecoderThreadPoolManager { + private: + DISABLE_COPY_AND_ASSIGN(DecoderThreadPoolManager); + + static DecoderThreadPoolManager *pm_instance_ptr_; + static std::mutex m_; + + std::map> prog_id_to_pool_; + + public: + static DecoderThreadPoolManager* Instance() { + if (pm_instance_ptr_ == nullptr) { + std::lock_guard lk(m_); + if (pm_instance_ptr_ == nullptr) { + pm_instance_ptr_ = new DecoderThreadPoolManager; + } + } + return pm_instance_ptr_; + } + + 
NvjpegDecoderThreadPool* GetDecoderThreadPool( + const int64_t program_id, const int num_threads, + const std::string mode, const int dev_id) { + auto iter = prog_id_to_pool_.find(program_id); + if (iter == prog_id_to_pool_.end()) { + prog_id_to_pool_[program_id] = + std::unique_ptr( + new NvjpegDecoderThreadPool(num_threads, mode, dev_id)); + } + return prog_id_to_pool_[program_id].get(); + } + + void ShutDownDecoder(const int64_t program_id) { + auto iter = prog_id_to_pool_.find(program_id); + if (iter != prog_id_to_pool_.end()) { + iter->second.get()->ShutDown(); + prog_id_to_pool_.erase(program_id); + } + } + + void ShutDown() { + if (prog_id_to_pool_.empty()) return; + + std::lock_guard lk(m_); + auto iter = prog_id_to_pool_.begin(); + for (; iter != prog_id_to_pool_.end(); iter++) { + if (iter->second.get()) iter->second.get()->ShutDown(); + } + } + + DecoderThreadPoolManager() { VLOG(1) << "DecoderThreadPoolManager init"; } + + ~DecoderThreadPoolManager() { + VLOG(1) << "~DecoderThreadPoolManager"; + ShutDown(); + } +}; + } // namespace data } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/data/shutdown.h b/paddle/fluid/operators/data/shutdown.h index 222b4c31bb50b3..15ac57d3bdd5f8 100644 --- a/paddle/fluid/operators/data/shutdown.h +++ b/paddle/fluid/operators/data/shutdown.h @@ -16,6 +16,7 @@ #include "paddle/fluid/operators/data/data_reader_op.h" #include "paddle/fluid/operators/data/nvjpeg_decoder.h" #include "paddle/fluid/operators/data/map_runner.h" +#include "paddle/fluid/operators/data/pipeline.h" namespace paddle { @@ -24,21 +25,49 @@ namespace data { extern NvjpegDecoderThreadPool* decode_pool; -void ShutDownDataLoader() { - LOG(ERROR) << "ShutDownDataLoader enter"; +void ShutDownAllDataLoaders() { + LOG(ERROR) << "ShutDownAllDataLoaders enter"; // step 1: shutdown reader ReaderManager::Instance()->ShutDown(); - LOG(ERROR) << "ShutDownDataLoader reader_wrapper shutdown finish"; + LOG(ERROR) << 
"ShutDownAllDataLoaders reader_wrapper shutdown finish"; // step 2: shutdown decoder if (decode_pool) decode_pool->ShutDown(); - LOG(ERROR) << "ShutDownDataLoader decode_pool shutdown finish"; + LOG(ERROR) << "ShutDownAllDataLoaders decode_pool shutdown finish"; // step 3: shutdown MapRunner MapRunnerManager::Instance()->ShutDown(); - LOG(ERROR) << "ShutDownDataLoader MapRunner shutdown finish"; + LOG(ERROR) << "ShutDownAllDataLoaders MapRunner shutdown finish"; + + // step 3: shutdown Pipeline + PipelineManager::Instance()->ShutDown(); + LOG(ERROR) << "ShutDownAllDataLoaders Pipeline shutdown finish"; +} + +void ShutDownReadersAndDecoders(const int64_t program_id) { + LOG(ERROR) << "ShutDownReadersAndDecoders enter, program_id: " << program_id; + // step 1: shutdown reader + ReaderManager::Instance()->ShutDownReader(program_id); + + // step 2: shutdown decoder + DecoderThreadPoolManager::Instance()->ShutDownDecoder(program_id); + LOG(ERROR) << "ShutDownReadersAndDecoders finish"; +} + +void ShutDownMaps(const std::vector program_ids) { + LOG(ERROR) << "ShutDownMaps enter, maps size: " << program_ids.size(); + for (auto& program_id : program_ids) { + MapRunnerManager::Instance()->ShutDownMapRunner(program_id); + } + LOG(ERROR) << "ShutDownMaps finish"; } -} // namespace data +void ShutDownPipeline(const int64_t program_id) { + LOG(ERROR) << "ShutDownPipeline program_id " << program_id << " enter"; + PipelineManager::Instance()->ShutDownPipeline(program_id); + LOG(ERROR) << "ShutDownPipeline program_id " << program_id << " finish"; +} + +} // namespace data } // namespace operators } // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 498b7d607174af..1df10e56212068 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -682,8 +682,15 @@ PYBIND11_MODULE(core_noavx, m) { m.def("_promote_types_if_complex_exists", &paddle::framework::PromoteTypesIfComplexExists); - m.def("_shutdown_dataloader", 
- &paddle::operators::data::ShutDownDataLoader); + m.def("_shutdown_all_dataloaders", + &paddle::operators::data::ShutDownAllDataLoaders); + m.def("_shutdown_readers_and_decoders", + &paddle::operators::data::ShutDownReadersAndDecoders); + m.def("_shutdown_maps", [](const std::vector program_ids) { + paddle::operators::data::ShutDownMaps(program_ids); + }); + m.def("_shutdown_pipeline", + &paddle::operators::data::ShutDownPipeline); BindImperative(&m); diff --git a/python/paddle/fluid/core.py b/python/paddle/fluid/core.py index 5e98e496e200e8..1d38c447e78159 100644 --- a/python/paddle/fluid/core.py +++ b/python/paddle/fluid/core.py @@ -277,7 +277,10 @@ def to_list(s): from .core_avx import _device_synchronize from .core_avx import _get_current_stream from .core_avx import _set_current_stream - from .core_avx import _shutdown_dataloader + from .core_avx import _shutdown_all_dataloaders + from .core_avx import _shutdown_readers_and_decoders + from .core_avx import _shutdown_maps + from .core_avx import _shutdown_pipeline if sys.platform != 'win32': from .core_avx import _set_process_pids from .core_avx import _erase_process_pids @@ -331,7 +334,10 @@ def to_list(s): from .core_noavx import _device_synchronize from .core_noavx import _get_current_stream from .core_noavx import _set_current_stream - from .core_noavx import _shutdown_dataloader + from .core_noavx import _shutdown_all_dataloaders + from .core_noavx import _shutdown_readers_and_decoders + from .core_noavx import _shutdown_maps + from .core_noavx import _shutdown_pipeline if sys.platform != 'win32': from .core_noavx import _set_process_pids from .core_noavx import _erase_process_pids diff --git a/python/paddle/fluid/dataloader/ops.py b/python/paddle/fluid/dataloader/ops.py index 307ea41b4a664f..4b2a569a8bec1c 100755 --- a/python/paddle/fluid/dataloader/ops.py +++ b/python/paddle/fluid/dataloader/ops.py @@ -121,7 +121,6 @@ def data_reader(reader_func, # build reader block main_program = helper.main_program 
with _ProgramGuard(main_program): - reader_id= _hash_with_id(main_program, reader_func) reader_block = main_program.current_block() indices_var = reader_block.create_var( @@ -142,7 +141,7 @@ def data_reader(reader_func, persistable=True) for outp in program_outputs] attrs = { - "reader_id": reader_id, + "reader_id": _hash_with_id(main_program), "reader_block": reader_block, "indices_var_name": indices_var_name, "output_var_names": output_var_names, diff --git a/python/paddle/fluid/dataloader/pipeline.py b/python/paddle/fluid/dataloader/pipeline.py index 945290e4dd2d45..46dda1d6dfca83 100755 --- a/python/paddle/fluid/dataloader/pipeline.py +++ b/python/paddle/fluid/dataloader/pipeline.py @@ -27,7 +27,7 @@ __all__ = ["Pipeline"] -CleanupFuncRegistrar.register(core._shutdown_dataloader) +CleanupFuncRegistrar.register(core._shutdown_all_dataloaders) AVAILABLE_OP_TYPES = ['data_reader', 'map'] @@ -47,6 +47,8 @@ def __init__(self, queue_depth=2): self._queue_depth = queue_depth self._init_programs() + self.is_shutdown = False + def _init_programs(self): self._main_program = fluid.Program() self._startup_program = fluid.Program() @@ -142,5 +144,21 @@ def __next__(self): def next(self): return self.__next__() + def shutdown(self): + if not self.is_shutdown: + try: + program_id = _hash_with_id(self._main_program) + core._shutdown_readers_and_decoders(program_id) + + map_program_ids = [] + for op in self._main_program.block(0).ops: + if op.type == "map": + map_program_ids.append(op.attrs['program_id']) + core._shutdown_maps(program_id) + + core._shutdown_pipeline(program_id) + finally: + self.is_shutdown = True + def __del__(self): - core._shutdown_dataloader() + self.shutdown() diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index f4c356151df247..61f1ef3c372db5 100644 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -15,7 +15,7 @@ import numpy as np from ..fluid.layer_helper import LayerHelper, unique_name from 
..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype -from ..fluid import core, layers +from ..fluid import core, layers, default_main_program from ..fluid.layers import nn, utils from ..nn import Layer from ..fluid.initializer import Normal @@ -907,14 +907,16 @@ def image_decode(x, mode='unchanged', num_threads=2, name=None): out = core.VarBase(core.VarDesc.VarType.UINT8, [], unique_name.generate("image_decode"), core.VarDesc.VarType.LOD_TENSOR_ARRAY, False) + program_id = utils._hash_with_id(mode, num_threads, name, local_rank) return _C_ops.batch_decode( x, out, "mode", mode, "num_threads", num_threads, - "local_rank", local_rank) + "local_rank", local_rank, "program_id", program_id) inputs = {'X': x} attrs = {"mode": mode, "num_threads": num_threads, - "local_rank": local_rank} + "local_rank": local_rank, + "program_id": utils._hash_with_id(default_main_program())} helper = LayerHelper("batch_decode", **locals()) out = helper.create_variable( @@ -993,7 +995,8 @@ def image_decode_random_crop(x, "area_min": area_min, "area_max": area_max, "num_attempts": num_attempts, - "local_rank": local_rank} + "local_rank": local_rank, + "program_id": utils._hash_with_id(default_main_program())} helper = LayerHelper("batch_decode_random_crop", **locals()) out = helper.create_variable( diff --git a/python/paddle/vision/reader.py b/python/paddle/vision/reader.py index 3084b041df9909..b4313b439ebdf9 100644 --- a/python/paddle/vision/reader.py +++ b/python/paddle/vision/reader.py @@ -163,7 +163,6 @@ def file_label_reader(file_root, if in_dygraph_mode(): sample_id = utils._hash_with_id(file_root, batch_size, shuffle, drop_last) - print("sample_id", sample_id) sampler = _sampler_manager.get(sample_id, batch_size=batch_size, num_samples=len(samples), From 71ee11cf9485fc1826e330a09062e33945655666 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Wed, 26 Jan 2022 08:17:17 +0000 Subject: [PATCH 55/95] fix dygraph error --- python/paddle/vision/ops.py | 4 +++- 1 
file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index 61f1ef3c372db5..53b26bc56a5a87 100644 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -980,12 +980,14 @@ def image_decode_random_crop(x, out = core.VarBase(core.VarDesc.VarType.UINT8, [], unique_name.generate("image_decode_random_crop"), core.VarDesc.VarType.LOD_TENSOR_ARRAY, False) + program_id = utils._hash_with_id(mode, num_threads, name, local_rank) return _C_ops.batch_decode_random_crop( x, out, "mode", mode, "num_threads", num_threads, "aspect_ratio_min", aspect_ratio_min, "aspect_ratio_max", aspect_ratio_max, "area_min", area_min, "area_max", area_max, - "num_attempts", num_attempts, "local_rank", local_rank) + "num_attempts", num_attempts, "local_rank", local_rank, + "program_id", program_id) inputs = {'X': x} attrs = {"mode": mode, From 72922e6a2023630b17a456eb3e697d7e532ded1b Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Wed, 26 Jan 2022 15:13:08 +0000 Subject: [PATCH 56/95] refine shutdown --- paddle/fluid/operators/data/data_reader_op.h | 3 +-- paddle/fluid/operators/data/map_runner.cc | 2 +- paddle/fluid/operators/data/nvjpeg_decoder.cc | 4 ++-- python/paddle/fluid/dataloader/pipeline.py | 6 +++--- python/paddle/vision/ops.py | 2 +- 5 files changed, 8 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/operators/data/data_reader_op.h b/paddle/fluid/operators/data/data_reader_op.h index aaa29b79a00216..1532d3e2c6d8e7 100644 --- a/paddle/fluid/operators/data/data_reader_op.h +++ b/paddle/fluid/operators/data/data_reader_op.h @@ -172,8 +172,7 @@ class DataReader { void ShutDown() { for(auto& queue: output_queues_) { - while (queue->Size()) sleep(0.5); - queue->Close(); + if (queue && !queue->IsClosed()) queue->Close(); } running_.store(false); diff --git a/paddle/fluid/operators/data/map_runner.cc b/paddle/fluid/operators/data/map_runner.cc index 94cec0ca9f7f42..e6bc75802d9de8 100644 --- 
a/paddle/fluid/operators/data/map_runner.cc +++ b/paddle/fluid/operators/data/map_runner.cc @@ -206,7 +206,7 @@ void MapRunner::ShutDown() { running_.store(false); for (auto queue : output_queues_) { - if(queue) queue->Close(); + if(queue && !queue->IsClosed()) queue->Close(); } } diff --git a/paddle/fluid/operators/data/nvjpeg_decoder.cc b/paddle/fluid/operators/data/nvjpeg_decoder.cc index e6f412fb9794ae..d0e883d65dddf8 100644 --- a/paddle/fluid/operators/data/nvjpeg_decoder.cc +++ b/paddle/fluid/operators/data/nvjpeg_decoder.cc @@ -211,11 +211,11 @@ void NvjpegDecoderThreadPool::ShutDown() { running_cond_.notify_all(); lock.unlock(); + task_queue_.clear(); + for (auto& thread : threads_) { if (thread.joinable()) thread.join(); } - - task_queue_.clear(); } void NvjpegDecoderThreadPool::SortTaskByLengthDescend() { diff --git a/python/paddle/fluid/dataloader/pipeline.py b/python/paddle/fluid/dataloader/pipeline.py index 46dda1d6dfca83..f2da5b12102f7f 100755 --- a/python/paddle/fluid/dataloader/pipeline.py +++ b/python/paddle/fluid/dataloader/pipeline.py @@ -152,9 +152,9 @@ def shutdown(self): map_program_ids = [] for op in self._main_program.block(0).ops: - if op.type == "map": - map_program_ids.append(op.attrs['program_id']) - core._shutdown_maps(program_id) + if op.type == "map" and op.has_attr('program_id'): + map_program_ids.append(op.attr('program_id')) + core._shutdown_maps(map_program_ids) core._shutdown_pipeline(program_id) finally: diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index 53b26bc56a5a87..5faa991d6b576b 100644 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -1018,7 +1018,7 @@ def random_flip(x, prob=0.5, name=None): if in_dygraph_mode(): p = np.random.uniform(0., 1., x.shape[0:1]) - for i in range(batch_size): + for i in range(x.shape[0]): if p[i] < prob: x[i] = paddle.flip(x[i], -1) return x From 6ed135c822b9b113214254831caffc917799571d Mon Sep 17 00:00:00 2001 From: LielinJiang Date: Thu, 27 Jan 
2022 06:42:40 +0000 Subject: [PATCH 57/95] opencv --- CMakeLists.txt | 16 +++ cmake/generic.cmake | 2 + cmake/third_party.cmake | 10 ++ paddle/fluid/operators/data/CMakeLists.txt | 6 +- .../operators/data/file_label_reader_op.h | 4 + paddle/fluid/operators/data/nvjpeg_decoder.cc | 104 ++++++++++++++++-- paddle/fluid/operators/data/nvjpeg_decoder.h | 7 +- 7 files changed, 138 insertions(+), 11 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 334a6cfcd0ee14..c56ff15b95a039 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -229,6 +229,7 @@ option(NEW_RELEASE_CUBIN "PaddlePaddle next-level release strategy for pypi cu option(NEW_RELEASE_JIT "PaddlePaddle next-level release strategy for backup jit package" OFF) option(WITH_ASCEND_INT64 "Compile with int64 kernel for ascend NPU" OFF) option(WITH_POCKETFFT "Compile with pocketfft support" ON) +option(WITH_OPENCV "Compile with pocketfft support" ON) # PY_VERSION if(NOT PY_VERSION) @@ -336,6 +337,21 @@ include(third_party) # download, build, install third_party, Contains about 20+ include(flags) # set paddle compile flags +if(WITH_OPENCV) + find_package(OpenCV 4.0 QUIET COMPONENTS core imgproc imgcodecs) + if(NOT OpenCV_FOUND) + find_package(OpenCV 3.0 REQUIRED COMPONENTS core imgproc imgcodecs) + endif() + message(STATUS "Found OpenCV: ${OpenCV_INCLUDE_DIRS} (found suitable version \"${OpenCV_VERSION}\", minimum required is \"3.0\")") + include_directories(SYSTEM ${OpenCV_INCLUDE_DIRS}) + message("DEBUGGGG_OPENCV") + include_directories(${OpenCV_INCLUDE_DIRS}) + link_directories(${OpenCV_LIBS}) + # target_link_libraries(paddle ${OpenCV_LIBS}) + add_definitions(-DPADDLE_WITH_OPENCV) + # target_link_libraries(paddle ${OpenCV_LIBS} ) +endif() + if(WITH_PROFILER) find_package(Gperftools REQUIRED) include_directories(${GPERFTOOLS_INCLUDE_DIR}) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 2004abcbfa1f22..5c08478a28a6dd 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -134,6 +134,8 @@ 
function(common_link TARGET_NAME) if (WITH_PROFILER) target_link_libraries(${TARGET_NAME} gperftools::profiler) endif() + + endfunction() # find all third_party modules is used for paddle static library diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index 7df70477682bb1..0d9c9429d45ddc 100644 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -378,4 +378,14 @@ if (WITH_POCKETFFT) add_definitions(-DPADDLE_WITH_POCKETFFT) endif (WITH_POCKETFFT) +# if (OPENCV_FOUND) +# # include(${OpenCV_INCLUDE_DIRS}) +# # list(APPEND third_party_deps ${OpenCV_LIBS}) +# # add_definitions(-DPADDLE_WITH_OPENCV) +# message("DEBUGGGG_OPENCV") +# include_directories(${OpenCV_INCLUDE_DIRS}) +# link_directories(${OpenCV_LIBS}) +# add_definitions(-DPADDLE_WITH_OPENCV) +# endif (OPENCV_FOUND) + add_custom_target(third_party ALL DEPENDS ${third_party_deps}) diff --git a/paddle/fluid/operators/data/CMakeLists.txt b/paddle/fluid/operators/data/CMakeLists.txt index 0af5a7d2334acf..c6bc165c3fa640 100644 --- a/paddle/fluid/operators/data/CMakeLists.txt +++ b/paddle/fluid/operators/data/CMakeLists.txt @@ -4,6 +4,10 @@ if(WITH_UNITY_BUILD) include(unity_build_rule.cmake) endif() +# find_package(ZLIB) +# include_directories(${ZLIB_INCLUDE_DIRS}) +# TARGET_LINK_LIBRARIES( ${ZLIB_LIBRARIES}) + cc_library(pipeline SRCS pipeline.cc DEPS parallel_executor simple_threadpool scope) op_library(dataloader_op SRCS dataloader_op.cc dataloader_op.cu.cc DEPS pipeline ${OP_HEADER_DEPS}) @@ -11,7 +15,7 @@ cc_library(map_runner SRCS map_runner.cc DEPS parallel_executor simple_threadpoo op_library(map_op SRCS map_op.cc map_op.cu.cc DEPS map_runner ${OP_HEADER_DEPS}) cc_library(random_roi_generator SRCS random_roi_generator.cc DEPS ${OP_HEADER_DEPS}) -cc_library(nvjpeg_decoder SRCS nvjpeg_decoder.cc DEPS random_roi_generator ${OP_HEADER_DEPS}) +cc_library(nvjpeg_decoder SRCS nvjpeg_decoder.cc DEPS random_roi_generator ${OP_HEADER_DEPS} ${OpenCV_LIBS}) op_library(batch_decode_random_crop_op 
SRCS batch_decode_random_crop_op.cc batch_decode_random_crop_op.cu DEPS nvjpeg_decoder ${OP_HEADER_DEPS}) op_library(batch_decode_op SRCS batch_decode_op.cc batch_decode_op.cu DEPS nvjpeg_decoder ${OP_HEADER_DEPS}) diff --git a/paddle/fluid/operators/data/file_label_reader_op.h b/paddle/fluid/operators/data/file_label_reader_op.h index a0e70455ce8663..34e2252d256b98 100644 --- a/paddle/fluid/operators/data/file_label_reader_op.h +++ b/paddle/fluid/operators/data/file_label_reader_op.h @@ -108,6 +108,7 @@ class FileDataReader { // iters_per_epoch_ = labels.size() / (batch_size_ * world_size_); auto total_batch_size = batch_size_ * world_size_; iters_per_epoch_ = (labels.size() + total_batch_size) / total_batch_size; + std::cout << "DEBUGGGGG iters per epoch: " << iters_per_epoch_ << std::endl; is_closed_ = false; for (int i = 0, n = files.size(); i < n; i++) image_label_pairs_.emplace_back(std::move(files[i]), labels[i]); @@ -177,6 +178,9 @@ class FileDataReader { is_closed_.store(true); break; } + else{ + std::cout << "index / size: " << i << " / " << image_label_pairs_.size() << std::endl; + } i %= image_label_pairs_.size(); framework::LoDTensor tmp = ReadSample(image_label_pairs_[i].first); ret.push_back(std::move(tmp)); diff --git a/paddle/fluid/operators/data/nvjpeg_decoder.cc b/paddle/fluid/operators/data/nvjpeg_decoder.cc index 931fb1411085cf..c6a6ae27fec32d 100644 --- a/paddle/fluid/operators/data/nvjpeg_decoder.cc +++ b/paddle/fluid/operators/data/nvjpeg_decoder.cc @@ -74,7 +74,71 @@ NvjpegDecoder::~NvjpegDecoder() { PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(cuda_stream_)); } -void NvjpegDecoder::ParseDecodeParams( +// cv::Mat DecodeRandomCropResize(const unsigned char* data, size_t length, +// RandomROIGenerator* roi_generator, +// unsigned char* workspace, size_t workspace_size, +// unsigned char* dst, int target_width, +// int target_height) { +void NvjpegDecoder::CPUDecodeRandomCropResize(const uint8_t* data, size_t length, + RandomROIGenerator* 
roi_generator, + unsigned char* workspace, size_t workspace_size, + framework::LoDTensor& temp, framework::LoDTensor* out, platform::Place place) { + cv::Mat image = + cv::imdecode(cv::Mat(1, length, CV_8UC1, const_cast(data)), cv::IMREAD_COLOR); + cv::Mat cropped; + int height; + int width; + if (roi_generator) { + ROI roi; + roi_generator->GenerateRandomROI(image.cols, image.rows, &roi); + cv::Rect cv_roi; + cv_roi.x = roi.x; + cv_roi.y = roi.y; + cv_roi.width = roi.w; + cv_roi.height = roi.h; + // PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecodeParamsSetROI(decode_params_, roi.x, roi.y, roi.w, roi.h)); + // height = roi.h; + // width = roi.w; + // } + // cv::Rect roi; + // roi_generator->GenerateRandomROI(image.cols, image.rows, &roi.x, &roi.y, &roi.width, + // &roi.height); + height = roi.h; + width = roi.w; + std::vector out_shape = {3, height, width}; + temp.Resize(framework::make_ddim(out_shape)); + platform::CPUPlace cpu; + // allocate memory and assign to out_image + auto* data = temp.mutable_data(cpu); + cropped.data = data; + image(cv_roi).copyTo(cropped); + out->Resize(framework::make_ddim(out_shape)); + // auto* data = temp.mutable_data(cpu); + + TensorCopySync(temp, place, out); + // cropped = image; + + } else { + cropped = image; + } + + // std::vector out_shape = {3, height, width}; + // temp.Resize(framework::make_ddim(out_shape)); + // platform::CPUPlace cpu; + // // allocate memory and assign to out_image + // auto* data = temp.mutable_data(cpu); + // data = cropped.data; + // out->Resize(framework::make_ddim(out_shape)); + // // auto* data = temp.mutable_data(cpu); + // TensorCopySync(temp, place, out); + // return cropped; + // cv::Mat resized; + // cv::resize(cropped, resized, cv::Size(target_width, target_height), 0, 0, cv::INTER_LINEAR); + // cv::Mat dst_mat(target_height, target_width, CV_8UC3, dst, cv::Mat::AUTO_STEP); + // cv::cvtColor(resized, dst_mat, cv::COLOR_BGR2RGB); +} + +int NvjpegDecoder::ParseDecodeParams( const 
uint8_t* bit_stream, size_t bit_len, framework::LoDTensor* out, RandomROIGenerator* roi_generator, nvjpegImage_t* out_image, platform::Place place) { @@ -83,10 +147,26 @@ void NvjpegDecoder::ParseDecodeParams( int widths[NVJPEG_MAX_COMPONENT]; int heights[NVJPEG_MAX_COMPONENT]; - PADDLE_ENFORCE_NVJPEG_SUCCESS( - platform::dynload::nvjpegGetImageInfo(handle_, bit_stream, bit_len, - &components, &subsampling, widths, heights)); - + + nvjpegStatus_t status = platform::dynload::nvjpegGetImageInfo(handle_, bit_stream, bit_len, + &components, &subsampling, widths, heights); + // PADDLE_ENFORCE_NVJPEG_SUCCESS( + // platform::dynload::nvjpegGetImageInfo(handle_, bit_stream, bit_len, + // &components, &subsampling, widths, heights)); + if (status != NVJPEG_STATUS_SUCCESS || (components != 3 && components != 1)) { + framework::LoDTensor temp; + CPUDecodeRandomCropResize(bit_stream, bit_len, roi_generator, nullptr, 0, temp, out, place); + return 1; + + // CHECK_LE(target_width * target_height * kNumChannels, fallback_buffer_size_); + // fallback_handle_.DecodeRandomCropResize(data, length, crop_generator, nullptr, 0, + // fallback_buffer_, target_width, target_height); + // OF_CUDA_CHECK(cudaMemcpyAsync(dst, fallback_buffer_, + // target_width * target_height * kNumChannels, cudaMemcpyDefault, + // cuda_stream_)); + // return; + } + int64_t width = static_cast(widths[0]); int64_t height = static_cast(heights[0]); @@ -102,7 +182,7 @@ void NvjpegDecoder::ParseDecodeParams( output_components = 3; } else { PADDLE_THROW(platform::errors::Fatal( - "The provided mode is not supported for JPEG files on GPU")); + "The provided mode is not supported for JPEG files on GPU: %s!", mode_)); } } else if (mode_ == "gray") { output_format = NVJPEG_OUTPUT_Y; @@ -111,8 +191,9 @@ void NvjpegDecoder::ParseDecodeParams( output_format = NVJPEG_OUTPUT_RGBI; output_components = 3; } else { + // std::cout << mode_ << std::endl; PADDLE_THROW(platform::errors::Fatal( - "The provided mode is not 
supported for JPEG files on GPU")); + "The provided mode is not supported for JPEG files on GPU: %s!", mode_)); } PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecodeParamsSetOutputFormat(decode_params_, output_format)); @@ -133,6 +214,7 @@ void NvjpegDecoder::ParseDecodeParams( auto* data = out->mutable_data(place); out_image->channel[0] = data; out_image->pitch[0] = output_components * width; + return 0; } void NvjpegDecoder::Decode(const uint8_t* bit_stream, size_t bit_len, nvjpegImage_t* out_image) { @@ -143,7 +225,8 @@ void NvjpegDecoder::Decode(const uint8_t* bit_stream, size_t bit_len, nvjpegImag // decode jpeg in host to pinned buffer PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegStateAttachPinnedBuffer(state_, buffer)); PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegJpegStreamParse(handle_, bit_stream, bit_len, false, false, stream)); - PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecodeJpegHost(handle_, decoder_, state_, decode_params_, stream)); + + (platform::dynload::nvjpegDecodeJpegHost(handle_, decoder_, state_, decode_params_, stream)); // transfer and decode to device buffer PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegStateAttachDeviceBuffer(state_, device_buffer_)); @@ -157,7 +240,10 @@ void NvjpegDecoder::Run( const uint8_t* bit_stream, size_t bit_len, framework::LoDTensor* out, RandomROIGenerator* roi_generator, platform::Place& place) { nvjpegImage_t image; - ParseDecodeParams(bit_stream, bit_len, out, roi_generator, &image, place); + int res = ParseDecodeParams(bit_stream, bit_len, out, roi_generator, &image, place); + if (res) { + return; + } Decode(bit_stream, bit_len, &image); } diff --git a/paddle/fluid/operators/data/nvjpeg_decoder.h b/paddle/fluid/operators/data/nvjpeg_decoder.h index 73a0beabb54f13..b8074d880d34ac 100644 --- a/paddle/fluid/operators/data/nvjpeg_decoder.h +++ b/paddle/fluid/operators/data/nvjpeg_decoder.h @@ -15,6 +15,7 @@ limitations under the License. 
*/ #pragma once #include +#include #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/gpu_info.h" @@ -53,8 +54,12 @@ class NvjpegDecoder { private: DISABLE_COPY_AND_ASSIGN(NvjpegDecoder); + void CPUDecodeRandomCropResize(const uint8_t* data, size_t length, + RandomROIGenerator* roi_generator, + unsigned char* workspace, size_t workspace_size, + framework::LoDTensor& temp, framework::LoDTensor* out, platform::Place place); - void ParseDecodeParams( + int ParseDecodeParams( const uint8_t* bit_stream, size_t bit_len, framework::LoDTensor* out, RandomROIGenerator* roi_generator, nvjpegImage_t* out_image, platform::Place place); From b459b82f4b98e2ca25174e6e8daaae9cc4cd6904 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Fri, 28 Jan 2022 13:39:37 +0000 Subject: [PATCH 58/95] fix train speed --- .../data/batch_decode_random_crop_op.cu | 2 +- paddle/fluid/operators/data/data_reader_op.cc | 1 + .../operators/data/file_label_loader_op.cc | 36 ++--- .../operators/data/file_label_loader_op.h | 145 +++++++++++++++++- python/paddle/vision/reader.py | 15 +- 5 files changed, 165 insertions(+), 34 deletions(-) diff --git a/paddle/fluid/operators/data/batch_decode_random_crop_op.cu b/paddle/fluid/operators/data/batch_decode_random_crop_op.cu index 728a861861bc31..eecf5da9bed9ca 100644 --- a/paddle/fluid/operators/data/batch_decode_random_crop_op.cu +++ b/paddle/fluid/operators/data/batch_decode_random_crop_op.cu @@ -42,7 +42,7 @@ class GPUBatchDecodeRandomCropKernel : public framework::OpKernel { const framework::LoDTensorArray* inputs = ctx.Input("X"); - LOG(ERROR) << "GPUBatchDecodeJpegKernel Compute start, num_threads: " << num_threads << ", batch_size: " << inputs->size(); + LOG(ERROR) << "GPUBatchDecodeJpegKernel Compute start, num_threads: " << num_threads << ", batch_size: " << inputs->size() << ", program_id: " << program_id; auto* out = ctx.OutputVar("Out"); auto dev = platform::CUDAPlace(local_rank); diff 
--git a/paddle/fluid/operators/data/data_reader_op.cc b/paddle/fluid/operators/data/data_reader_op.cc index 6cfbd0d12f9c33..67eeb641911f60 100644 --- a/paddle/fluid/operators/data/data_reader_op.cc +++ b/paddle/fluid/operators/data/data_reader_op.cc @@ -63,6 +63,7 @@ class DataReaderOp : public framework::OperatorBase { auto output_var_names = Attr>("output_var_names"); auto* reader_block = Attr("reader_block"); auto reader_id = Attr("reader_id"); + LOG(ERROR) << "DataReaderOp enter, reader_id: " << reader_id; auto output_queues = GetQueueVecFromVariableVec(output_vars); ReaderManager::Instance()->StartDataReader( diff --git a/paddle/fluid/operators/data/file_label_loader_op.cc b/paddle/fluid/operators/data/file_label_loader_op.cc index d63de3e471f9ff..0081891c6f8b09 100644 --- a/paddle/fluid/operators/data/file_label_loader_op.cc +++ b/paddle/fluid/operators/data/file_label_loader_op.cc @@ -17,11 +17,6 @@ namespace paddle { namespace operators { namespace data { -// FileDataReaderWrapper reader_wrapper; - -// // initialization static variables out of ReaderManager -// ReaderManager *ReaderManager::rm_instance_ptr_ = nullptr; -// std::mutex ReaderManager::m_; class FileLabelLoaderOp : public framework::OperatorWithKernel { public: @@ -44,17 +39,17 @@ class FileLabelLoaderOp : public framework::OperatorWithKernel { platform::errors::InvalidArgument( "Input(Indices) should be a 1-D Tensor")); - auto files = ctx->Attrs().Get>("files"); - auto labels = ctx->Attrs().Get>("labels"); - PADDLE_ENFORCE_GT(files.size(), 0, - platform::errors::InvalidArgument( - "length of files should be greater than 0")); - PADDLE_ENFORCE_GT(labels.size(), 0, - platform::errors::InvalidArgument( - "length of labels should be greater than 0")); - PADDLE_ENFORCE_EQ(files.size(), labels.size(), - platform::errors::InvalidArgument( - "length of labels and files should be equal")); + // auto files = ctx->Attrs().Get>("files"); + // auto labels = ctx->Attrs().Get>("labels"); + // 
PADDLE_ENFORCE_GT(files.size(), 0, + // platform::errors::InvalidArgument( + // "length of files should be greater than 0")); + // PADDLE_ENFORCE_GT(labels.size(), 0, + // platform::errors::InvalidArgument( + // "length of labels should be greater than 0")); + // PADDLE_ENFORCE_EQ(files.size(), labels.size(), + // platform::errors::InvalidArgument( + // "length of labels and files should be equal")); } framework::OpKernelType GetExpectedKernelType( @@ -113,10 +108,11 @@ class FileLabelLoaderOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Indices", "The batch indices of input samples"); AddOutput("Image", "The output image tensor of ReadFileLoader op"); AddOutput("Label", "The output label tensor of ReadFileLoader op"); - AddAttr>("files", "Path of the file to be readed.") - .SetDefault({}); - AddAttr>("labels", "Path of the file to be readed.") - .SetDefault({}); + AddAttr("data_root", "Path of root directory of dataset"); + // AddAttr>("files", "Path of the file to be readed.") + // .SetDefault({}); + // AddAttr>("labels", "Path of the file to be readed.") + // .SetDefault({}); AddComment(R"DOC( This operator read a file. 
)DOC"); diff --git a/paddle/fluid/operators/data/file_label_loader_op.h b/paddle/fluid/operators/data/file_label_loader_op.h index b8db6be900fb70..bbc34a7b546bf1 100644 --- a/paddle/fluid/operators/data/file_label_loader_op.h +++ b/paddle/fluid/operators/data/file_label_loader_op.h @@ -15,7 +15,10 @@ #pragma once #include #include +#include #include +#include +#include #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/lod_tensor_array.h" @@ -30,6 +33,136 @@ namespace data { using LoDTensor = framework::LoDTensor; using LoDTensorArray = framework::LoDTensorArray; +// static void ParseClasses(const std::string data_root, +// std::vector* classes) { +// _finddata_t findData; +// auto handle = _findfirst(data_root, &findData); +// PADDLE_ENFORCE_NE(handle, -1, platform::errors::InvalidArgument( +// "Cannot find files under data_root")); +// +// do { +// if (findData.attrib & _A_SUBDIRi && findData.name != "." +// && findData.name != "..") { +// classes->emplace_back(findData.name); +// } +// } while (_findnext(handle, &findData) == 0); +// +// std::sort(classes->begin(), classes->end()); +// for (size_t i = 0; i < classes->size(); i++) { +// LOG(ERROR) << "class id " << i << ": " << classes->at(i); +// } +// } + +// static void ParseFilesAndLabels(const std::string data_root, +// std::vector* files, +// std::vector labels) { +// std::vector classes; +// ParseClasses(data_root, &classes); +// +// _finddata_t findData; +// for (int i = 0; i < static_cast(classes.size()); i++) { +// auto cls_dir = data_root + "/" + classes[i]; +// auto handle = _findfirst(cls_dir, &findData); +// if (handle == -1) break; +// +// do { +// if (findData.name == "." 
|| findData.name == "..") continue; +// files->emplace_back(cls_dir + "/" + findData.name); +// labels->emplace_back(i); +// } +// } +// } + +#ifdef _WIN32 +constexpr char DIR_SEP = '\\'; +#else +constexpr char DIR_SEP = '/'; +#endif + +static std::string JoinPath(const std::string path1, + const std::string path2) { + // empty check + if (path1.empty()) return path2; + if (path1.empty()) return path1; + + // absolute path check + if (path2[0] == DIR_SEP) return path2; +#ifdef _WIN32 + if (path2[1] == ":") return path2; +#endif + + // concat path + if (path1[path1.length() - 1] == DIR_SEP) return path1 + path2; + return path1 + DIR_SEP + path2; +} + +static void ParseFilesAndLabels(const std::string data_root, + std::vector>* samples) { + auto* dir = opendir(data_root.c_str()); + PADDLE_ENFORCE_NE(dir, nullptr, platform::errors::InvalidArgument( + "Cannot open directory %s", data_root)); + + // Step 1: parse classes info + std::vector classes; + auto* entry = readdir(dir); + while (entry) { + if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0) { + entry = readdir(dir); + continue; + } + + auto cls_path = JoinPath(data_root, entry->d_name); + struct stat s; + int ret = stat(cls_path.c_str(), &s); + PADDLE_ENFORCE_EQ(ret, 0, platform::errors::InvalidArgument( + "Directory %s is unaccessiable.", cls_path)); + + if (S_ISDIR(s.st_mode)) classes.emplace_back(entry->d_name); + + entry = readdir(dir); + } + + closedir(dir); + + // sort directories in alphabetic order to generate class order + std::sort(classes.begin(), classes.end()); + + // Step 2: traverse directory to generate samples + for (int class_id = 0; class_id < static_cast(classes.size()); + class_id++) { + auto cur_dir = data_root + DIR_SEP + classes[class_id]; + dir = opendir(cur_dir.c_str()); + entry = readdir(dir); + while (entry) { + if (strcmp(entry->d_name, ".") == 0 + || strcmp(entry->d_name, "..") == 0) { + entry = readdir(dir); + continue; + } + + auto file = cur_dir + DIR_SEP + 
entry->d_name; + samples->emplace_back(std::make_pair(file, class_id)); + + entry = readdir(dir); + } + closedir(dir); + } +} + +std::map>> root_to_samples_; + +static std::vector>* GetFilesAndLabelsFromCache(const std::string data_root) { + auto iter = root_to_samples_.find(data_root); + if (iter == root_to_samples_.end()) { + std::vector> samples; + ParseFilesAndLabels(data_root, &samples); + LOG(ERROR) << "Init samples: " << samples.size(); + root_to_samples_[data_root] = samples; + } + + return &(root_to_samples_[data_root]); +} + // class FileDataReader { // public: // explicit FileDataReader(const framework::ExecutionContext& ctx, @@ -223,8 +356,8 @@ class FileLabelLoaderCPUKernel: public framework::OpKernel { auto* image_arr = ctx.Output("Image"); auto* label_tensor = ctx.Output("Label"); - auto files = ctx.Attr>("files"); - auto labels = ctx.Attr>("labels"); + auto data_root = ctx.Attr("data_root"); + auto* samples = GetFilesAndLabelsFromCache(data_root); auto batch_size = indices->dims()[0]; const int64_t* indices_data = indices->data(); @@ -235,8 +368,10 @@ class FileLabelLoaderCPUKernel: public framework::OpKernel { framework::make_ddim({static_cast(batch_size)})); auto* label_data = label_tensor->mutable_data(platform::CPUPlace()); for (int64_t i = 0; i < batch_size; i++) { - int64_t index = indices_data[i]; - std::ifstream input(files[index].c_str(), + int64_t index = static_cast(indices_data[i]); + auto file = samples->at(index).first; + auto label = samples->at(index).second; + std::ifstream input(file.c_str(), std::ios::in | std::ios::binary | std::ios::ate); std::streamsize file_size = input.tellg(); @@ -251,7 +386,7 @@ class FileLabelLoaderCPUKernel: public framework::OpKernel { input.read(reinterpret_cast(data), file_size); image_arr->emplace_back(image); - label_data[i] = labels[index]; + label_data[i] = label; } LOG(ERROR) << "FileLabelLoaderOp RunImpl finish"; diff --git a/python/paddle/vision/reader.py b/python/paddle/vision/reader.py index 
b4313b439ebdf9..1cd2ac9ab8d2fd 100644 --- a/python/paddle/vision/reader.py +++ b/python/paddle/vision/reader.py @@ -90,22 +90,21 @@ def file_label_loader(data_root, indices, name=None): need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. """ - from paddle.vision.datasets import DatasetFolder - data_folder = DatasetFolder(data_root) - samples = [s[0] for s in data_folder.samples] - targets = [s[1] for s in data_folder.samples] + # from paddle.vision.datasets import DatasetFolder + # data_folder = DatasetFolder(data_root) + # samples = [s[0] for s in data_folder.samples] + # targets = [s[1] for s in data_folder.samples] if in_dygraph_mode(): image = core.VarBase(core.VarDesc.VarType.UINT8, [], unique_name.generate("file_label_loader"), core.VarDesc.VarType.LOD_TENSOR_ARRAY, False) - return _C_ops.file_label_loader(indices, image, 'files', - samples, 'labels', targets) + return _C_ops.file_label_loader(indices, image, 'data_root', + data_root) inputs = {"Indices": indices} attrs = { - 'files': samples, - 'labels': targets, + 'data_root': data_root, } helper = LayerHelper("file_label_loader", **locals()) From a30b9fb4fc765ef149b46f2d61fb5efbde449fd0 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Sun, 13 Feb 2022 14:46:31 +0000 Subject: [PATCH 59/95] add random_flip op --- paddle/fluid/operators/data/CMakeLists.txt | 3 + paddle/fluid/operators/data/data_reader_op.h | 2 + paddle/fluid/operators/data/random_flip_op.cc | 90 ++++++++++++++++++ paddle/fluid/operators/data/random_flip_op.h | 86 +++++++++++++++++ .../operators/data/unity_build_rule.cmake | 3 +- paddle/fluid/operators/flip_op.cc | 95 ++++++++++--------- python/paddle/fluid/dataloader/pipeline.py | 20 ++-- python/paddle/vision/ops.py | 21 +++- 8 files changed, 262 insertions(+), 58 deletions(-) create mode 100644 paddle/fluid/operators/data/random_flip_op.cc create mode 100644 paddle/fluid/operators/data/random_flip_op.h diff --git 
a/paddle/fluid/operators/data/CMakeLists.txt b/paddle/fluid/operators/data/CMakeLists.txt index bf6470bd02df3f..915bda52a4de69 100644 --- a/paddle/fluid/operators/data/CMakeLists.txt +++ b/paddle/fluid/operators/data/CMakeLists.txt @@ -23,8 +23,11 @@ op_library(batch_decode_op SRCS batch_decode_op.cc batch_decode_op.cu DEPS nvjpe op_library(random_crop_and_resize_op SRCS random_crop_and_resize_op.cc random_crop_and_resize_op.cu DEPS ${OP_HEADER_DEPS}) op_library(batch_resize_op SRCS batch_resize_op.cc batch_resize_op.cu DEPS ${OP_HEADER_DEPS}) + op_library(file_label_loader_op SRCS file_label_loader_op.cc DEPS ${OP_HEADER_DEPS}) +op_library(random_flip_op SRCS random_flip_op.cc DEPS ${OP_HEADER_DEPS}) + # register_operators() # TODO: add test here diff --git a/paddle/fluid/operators/data/data_reader_op.h b/paddle/fluid/operators/data/data_reader_op.h index 1532d3e2c6d8e7..daa41182722328 100644 --- a/paddle/fluid/operators/data/data_reader_op.h +++ b/paddle/fluid/operators/data/data_reader_op.h @@ -49,6 +49,7 @@ class Sampler { drop_last_(drop_last), rank_(rank), world_size_(world_size) { + LOG(ERROR) << "Sampler num_samples " << num_samples; sample_ids_.reserve(num_samples); for (int64_t i = 0; i < num_samples; i++) { sample_ids_.emplace_back(i); @@ -125,6 +126,7 @@ class DataReader { sampler_.GetNextIndices(&indices); // shutdown reader if indices drained if (indices.size() == 0) { + LOG(ERROR) << "DataReader indices drained"; for(auto& queue: output_queues_) { while (queue->Size()) sleep(0.5); queue->Close(); diff --git a/paddle/fluid/operators/data/random_flip_op.cc b/paddle/fluid/operators/data/random_flip_op.cc new file mode 100644 index 00000000000000..3575c002bc0a9c --- /dev/null +++ b/paddle/fluid/operators/data/random_flip_op.cc @@ -0,0 +1,90 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "paddle/fluid/operators/data/random_flip_op.h" +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace operators { +namespace data { + +using framework::OpKernelType; +using framework::Tensor; + +class RandomFlipOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE_EQ( + ctx->HasInput("X"), true, + platform::errors::NotFound("Input(X) of RandomFlipOp should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, + platform::errors::NotFound( + "Output(Out) of RandomFlipOp should not be null.")); + + auto x_dims = ctx->GetInputDim("X"); + ctx->SetOutputDim("Out", framework::make_ddim({x_dims[0], 1})); + ctx->ShareLoD("X", "Out"); + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); + return framework::OpKernelType(input_data_type, + platform::CPUPlace()); + } +}; + +class RandomFlipOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of flip op."); + AddOutput("Out", "(Tensor), The output tensor in shape of [N, 1], N is " + "the batch size of X, bool data indicates whether to " + "perform flip in this sample."); + AddAttr("probability", "The probability to flip each sample.") + .SetDefault(0.5); + AddAttr("seed", "The seed for uniform random 
generator") + .SetDefault(0); + AddComment(R"DOC( + Random Flip Operator. + )DOC"); + } +}; + +class RandomFlipOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput { + protected: + std::unordered_map& GetInputOutputWithSameType() + const override { + static std::unordered_map m{{"X", /*->*/ "Out"}}; + return m; + } +}; + +} // namespace data +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators::data; +namespace plat = paddle::platform; +REGISTER_OPERATOR(random_flip, ops::RandomFlipOp, ops::RandomFlipOpMaker, ops::RandomFlipOpInferVarType); + +REGISTER_OP_CPU_KERNEL( + random_flip, ops::RandomFlipCPUKernel, + ops::RandomFlipCPUKernel, + ops::RandomFlipCPUKernel); diff --git a/paddle/fluid/operators/data/random_flip_op.h b/paddle/fluid/operators/data/random_flip_op.h new file mode 100644 index 00000000000000..e8f31e1fe69c28 --- /dev/null +++ b/paddle/fluid/operators/data/random_flip_op.h @@ -0,0 +1,86 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include +#include + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { +namespace data { + +using Tensor = framework::Tensor; + +constexpr size_t dim_bitset_size = 64; + +class RandomFlipGenerator { + public: + RandomFlipGenerator(int seed, float prob) + : distribution_(prob), + seed_(seed) { + if (seed != 0) rng_.seed(seed); + else rng_.seed(time(0)); + } + + ~RandomFlipGenerator() = default; + + bool Generate() { return distribution_(rng_); } + + private: + std::bernoulli_distribution distribution_; + int seed_; + std::mt19937 rng_; +}; + +std::map> seed_to_generator_; + +static RandomFlipGenerator* CreateRandomFlipGenerator(int seed, float prob) { + auto iter = seed_to_generator_.find(seed); + if (iter == seed_to_generator_.end()) { + seed_to_generator_[seed] = std::unique_ptr( + new RandomFlipGenerator(seed, prob)); + } + + return seed_to_generator_[seed].get(); +} + +template +class RandomFlipCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + LOG(ERROR) << "RandomFlipCPUKernel enter"; + const Tensor* x = ctx.Input("X"); + Tensor* out = ctx.Output("Out"); + + auto prob = ctx.Attr("probability"); + auto seed = ctx.Attr("seed"); + + auto* data = out->mutable_data(ctx.GetPlace()); + auto* generator = CreateRandomFlipGenerator(seed, prob); + for (int64_t i = 0; i < x->dims()[0]; i++) { + data[i] = generator->Generate() ? 
1 : 0; + } + LOG(ERROR) << "RandomFlipCPUKernel finish"; + } +}; + +} // namespace data +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/data/unity_build_rule.cmake b/paddle/fluid/operators/data/unity_build_rule.cmake index c49164464bb030..33fa45153fa4fe 100644 --- a/paddle/fluid/operators/data/unity_build_rule.cmake +++ b/paddle/fluid/operators/data/unity_build_rule.cmake @@ -11,7 +11,8 @@ register_unity_group(cc nvjpeg_decoder.cc dataloader_op.cc map_op.cc - batch_decode_random_crop_op.cc) + batch_decode_random_crop_op.cc + random_flip_op.cc) register_unity_group(cu dataloader_op.cu.cc diff --git a/paddle/fluid/operators/flip_op.cc b/paddle/fluid/operators/flip_op.cc index a08a0ca142053a..2261dfd19b6a6e 100644 --- a/paddle/fluid/operators/flip_op.cc +++ b/paddle/fluid/operators/flip_op.cc @@ -36,52 +36,57 @@ class FlipOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, platform::errors::NotFound( "Output(Out) of FlipOp should not be null.")); + auto x_dims = ctx->GetInputDim("X"); - auto flip_dims = ctx->Attrs().Get>("axis"); - size_t flip_dims_size = flip_dims.size(); - - if (flip_dims_size > 0) { - // check if dims axis within range - auto min_max_d = std::minmax_element(flip_dims.begin(), flip_dims.end()); - PADDLE_ENFORCE_LT( - *min_max_d.first, x_dims.size(), - platform::errors::InvalidArgument( - "min(axes) should be less than the input tensor X's " - "axes of FlipOp. But received min(axes) = %d, " - "X's axes = %d, X's shape = [%s]", - *min_max_d.first, x_dims.size(), x_dims)); - PADDLE_ENFORCE_GE(*min_max_d.first, x_dims.size() * -1, - platform::errors::InvalidArgument( - "min(axes) should be greater than or equal to the " - "input tensor X's " - "axes of FlipOp times -1. 
But received " - "min(axes) = %d, X's " - "axes = %d, X's shape = [%s]", - *min_max_d.first, x_dims.size() * -1, x_dims)); - PADDLE_ENFORCE_LT( - *min_max_d.second, x_dims.size(), - platform::errors::InvalidArgument( - "max(axes) should be less than the input tensor X's " - "axes of FlipOp. But received max(axes) = %d, " - "X's axes = %d, X's shape = [%s]", - *min_max_d.second, x_dims.size(), x_dims)); - PADDLE_ENFORCE_GE(*min_max_d.second, x_dims.size() * -1, - platform::errors::InvalidArgument( - "max(axes) should be greater than or equal to the " - "input tensor X's " - "axes of FlipOp times -1. But received " - "max(axes) = %d, X's " - "axes = %d, X's shape = [%s]", - *min_max_d.second, x_dims.size() * -1, x_dims)); - - // check duplicates in dims - flip_dims.erase(std::unique(flip_dims.begin(), flip_dims.end()), - flip_dims.end()); - PADDLE_ENFORCE_EQ(flip_dims.size(), flip_dims_size, - platform::errors::InvalidArgument( - "axes has duplicates, original flip axes size=%d, " - "but unique flip axes size=%d.)", - flip_dims_size, flip_dims.size())); + + if (ctx->IsRuntime()) { + auto flip_dims = ctx->Attrs().Get>("axis"); + size_t flip_dims_size = flip_dims.size(); + + if (flip_dims_size > 0) { + // check if dims axis within range + auto min_max_d = std::minmax_element(flip_dims.begin(), flip_dims.end()); + PADDLE_ENFORCE_LT( + *min_max_d.first, x_dims.size(), + platform::errors::InvalidArgument( + "min(axes) should be less than the input tensor X's " + "axes of FlipOp. But received min(axes) = %d, " + "X's axes = %d, X's shape = [%s]", + *min_max_d.first, x_dims.size(), x_dims)); + PADDLE_ENFORCE_GE(*min_max_d.first, x_dims.size() * -1, + platform::errors::InvalidArgument( + "min(axes) should be greater than or equal to the " + "input tensor X's " + "axes of FlipOp times -1. 
But received " + "min(axes) = %d, X's " + "axes = %d, X's shape = [%s]", + *min_max_d.first, x_dims.size() * -1, x_dims)); + PADDLE_ENFORCE_LT( + *min_max_d.second, x_dims.size(), + platform::errors::InvalidArgument( + "max(axes) should be less than the input tensor X's " + "axes of FlipOp. But received max(axes) = %d, " + "X's axes = %d, X's shape = [%s]", + *min_max_d.second, x_dims.size(), x_dims)); + PADDLE_ENFORCE_GE(*min_max_d.second, x_dims.size() * -1, + platform::errors::InvalidArgument( + "max(axes) should be greater than or equal to the " + "input tensor X's " + "axes of FlipOp times -1. But received " + "max(axes) = %d, X's " + "axes = %d, X's shape = [%s]", + *min_max_d.second, x_dims.size() * -1, x_dims)); + + // check duplicates in dims + flip_dims.erase(std::unique(flip_dims.begin(), flip_dims.end()), + flip_dims.end()); + PADDLE_ENFORCE_EQ(flip_dims.size(), flip_dims_size, + platform::errors::InvalidArgument( + "axes has duplicates, original flip axes size=%d, " + "but unique flip axes size=%d.)", + flip_dims_size, flip_dims.size())); + } + } VLOG(3) << "flip operator x.shape=" << x_dims; diff --git a/python/paddle/fluid/dataloader/pipeline.py b/python/paddle/fluid/dataloader/pipeline.py index f2da5b12102f7f..2206f39b8abff8 100755 --- a/python/paddle/fluid/dataloader/pipeline.py +++ b/python/paddle/fluid/dataloader/pipeline.py @@ -127,16 +127,16 @@ def __next__(self): "Pipeline not built, please call build() firstly" self._output_vars = self._prepare_output_vars() - try: - import sys - import time - tic = time.time() - _C_ops.dataloader(self._output_vars, *self._attrs) - toc = time.time() - print("_C_ops calling cost {}ms".format((toc - tic) * 1000.)) - sys.stdout.flush() - except: - raise StopIteration + # try: + import sys + import time + tic = time.time() + _C_ops.dataloader(self._output_vars, *self._attrs) + toc = time.time() + print("_C_ops calling cost {}ms".format((toc - tic) * 1000.)) + sys.stdout.flush() + # except: + # raise StopIteration 
return {k: v for k, v in zip(self._out_names, self._output_vars)} diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index 5faa991d6b576b..f9084b629c7e54 100644 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -1012,6 +1012,20 @@ def image_decode_random_crop(x, return out +def flip_vector(x, prob=0.5, name=None): + helper = LayerHelper("flip_vector", **locals()) + out = helper.create_variable( + name=unique_name.generate("flip_vector"), + type=core.VarDesc.VarType.LOD_TENSOR, + dtype=core.VarDesc.VarType.BOOL) + helper.append_op( + type="random_flip", + inputs={"X": x}, + outputs={"Out": out}, + attrs={"probability": prob}) + return out + + def random_flip(x, prob=0.5, name=None): if prob < 0. or prob > 1.: raise ValueError("prob should in (0, 1) in random_flip") @@ -1023,8 +1037,11 @@ def random_flip(x, prob=0.5, name=None): x[i] = paddle.flip(x[i], -1) return x - p = paddle.uniform([layers.shape(x)[0], 1], min=0., max=1.) - ie = layers.IfElse(p < prob) + # p = paddle.uniform([layers.shape(x)[0], 1], min=0., max=1.) 
+ # prob = paddle.ones([layers.shape(x)[0], 1]) * prob + # cond = layers.less_than(p, prob) + cond = flip_vector(x, prob) + ie = layers.IfElse(cond) with ie.true_block(): out = ie.input(x) out = paddle.flip(x, -1) From f482564e1ec3b7fd1c97e2406632c37bf71e59d1 Mon Sep 17 00:00:00 2001 From: LielinJiang Date: Mon, 14 Feb 2022 06:31:08 +0000 Subject: [PATCH 60/95] fix decode error and add layout for decode op --- paddle/fluid/operators/data/CMakeLists.txt | 7 +- .../data/batch_decode_random_crop_op.cc | 7 ++ .../data/batch_decode_random_crop_op.cu | 64 ++++++++++++++++--- .../data/batch_decode_random_crop_op.h | 37 ++++++++++- .../fluid/operators/data/batch_resize_op.cu | 17 +++++ .../operators/data/file_label_loader_op.h | 3 + paddle/fluid/operators/data/nvjpeg_decoder.cc | 28 ++++++-- paddle/fluid/operators/math/math_function.cu | 1 + python/paddle/vision/ops.py | 4 +- 9 files changed, 144 insertions(+), 24 deletions(-) diff --git a/paddle/fluid/operators/data/CMakeLists.txt b/paddle/fluid/operators/data/CMakeLists.txt index bf6470bd02df3f..f16b73ca172720 100644 --- a/paddle/fluid/operators/data/CMakeLists.txt +++ b/paddle/fluid/operators/data/CMakeLists.txt @@ -4,10 +4,6 @@ if(WITH_UNITY_BUILD) include(unity_build_rule.cmake) endif() -# find_package(ZLIB) -# include_directories(${ZLIB_INCLUDE_DIRS}) -# TARGET_LINK_LIBRARIES( ${ZLIB_LIBRARIES}) - cc_library(pipeline SRCS pipeline.cc DEPS parallel_executor simple_threadpool scope) op_library(dataloader_op SRCS dataloader_op.cc dataloader_op.cu.cc DEPS pipeline ${OP_HEADER_DEPS}) @@ -23,9 +19,10 @@ op_library(batch_decode_op SRCS batch_decode_op.cc batch_decode_op.cu DEPS nvjpe op_library(random_crop_and_resize_op SRCS random_crop_and_resize_op.cc random_crop_and_resize_op.cu DEPS ${OP_HEADER_DEPS}) op_library(batch_resize_op SRCS batch_resize_op.cc batch_resize_op.cu DEPS ${OP_HEADER_DEPS}) + op_library(file_label_loader_op SRCS file_label_loader_op.cc DEPS ${OP_HEADER_DEPS}) # register_operators() # TODO: add test 
here -# cc_test(xxx SRCS xxx DEPS xxx) +# cc_test(xxx SRCS xxx DEPS xxx \ No newline at end of file diff --git a/paddle/fluid/operators/data/batch_decode_random_crop_op.cc b/paddle/fluid/operators/data/batch_decode_random_crop_op.cc index 2ca56063936d14..7660f7f3ccb5a7 100644 --- a/paddle/fluid/operators/data/batch_decode_random_crop_op.cc +++ b/paddle/fluid/operators/data/batch_decode_random_crop_op.cc @@ -132,6 +132,13 @@ and 255. "for optionally converting the image, can be \"unchanged\" " ",\"gray\" , \"rgb\" .") .SetDefault("unchanged"); + AddAttr( + "data_layout", + "(string, default NCHW) Only used in " + "an optional string from: \"NHWC\", \"NCHW\". " + "Specify that the data format of the input and output data is " + "channel_first or channel_last.") + .SetDefault("NCHW"); AddAttr("aspect_ratio_min", "").SetDefault(3./4.); AddAttr("aspect_ratio_max", "").SetDefault(4./3.); AddAttr("area_min", "").SetDefault(0.08); diff --git a/paddle/fluid/operators/data/batch_decode_random_crop_op.cu b/paddle/fluid/operators/data/batch_decode_random_crop_op.cu index eecf5da9bed9ca..c15e9d0ae3e471 100644 --- a/paddle/fluid/operators/data/batch_decode_random_crop_op.cu +++ b/paddle/fluid/operators/data/batch_decode_random_crop_op.cu @@ -16,12 +16,15 @@ #include "paddle/fluid/operators/data/batch_decode_random_crop_op.h" #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" +#include "paddle/fluid/operators/math/math_function.h" +// #include "paddle/fluid/operators/transpose_op.h" namespace paddle { namespace operators { namespace data { using LoDTensorBlockingQueueHolder = operators::reader::LoDTensorBlockingQueueHolder; +using DataLayout = framework::DataLayout; NvjpegDecoderThreadPool* decode_pool = nullptr; // std::seed_seq* rand_seq = nullptr; @@ -50,6 +53,15 @@ class GPUBatchDecodeRandomCropKernel : public framework::OpKernel { auto& out_array = *out->GetMutable(); out_array.resize(inputs->size()); + const std::string data_layout_str = 
ctx.Attr("data_layout"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + + framework::LoDTensorArray temp_array; + if (data_layout == DataLayout::kNCHW) { + temp_array.resize(inputs->size()); + } + auto aspect_ratio_min = ctx.Attr("aspect_ratio_min"); auto aspect_ratio_max = ctx.Attr("aspect_ratio_max"); AspectRatioRange aspect_ratio_range{aspect_ratio_min, aspect_ratio_max}; @@ -66,20 +78,52 @@ class GPUBatchDecodeRandomCropKernel : public framework::OpKernel { const framework::LoDTensor x = inputs->at(i); auto* x_data = x.data(); size_t x_numel = static_cast(x.numel()); - - NvjpegDecodeTask task = { - .bit_stream = x_data, - .bit_len = x_numel, - .tensor = &out_array[i], - .roi_generator = new RandomROIGenerator( - aspect_ratio_range, area_range, rands[i]), - .place = dev - }; - decode_pool->AddTask(std::make_shared(task)); + + if (data_layout == DataLayout::kNCHW){ + NvjpegDecodeTask task = { + .bit_stream = x_data, + .bit_len = x_numel, + .tensor = &temp_array[i], + .roi_generator = new RandomROIGenerator( + aspect_ratio_range, area_range, rands[i]), + .place = dev + }; + decode_pool->AddTask(std::make_shared(task)); + } + else{ + NvjpegDecodeTask task = { + .bit_stream = x_data, + .bit_len = x_numel, + .tensor = &out_array[i], + .roi_generator = new RandomROIGenerator( + aspect_ratio_range, area_range, rands[i]), + .place = dev + }; + decode_pool->AddTask(std::make_shared(task)); + } + } decode_pool->RunAll(true); + if (data_layout == DataLayout::kNCHW){ + const auto& dev_ctx = ctx.cuda_device_context(); + paddle::operators::math::Transpose trans; + std::vector axis = {2, 0, 1}; + // LOG(ERROR) << "start transpose 01!!!"; + for (size_t i = 0; i < inputs->size(); i++) { + // Do transpose + const framework::DDim& in_sizes = temp_array[i].dims(); + // const int ndim = in_sizes.size(); + framework::DDim transposed_input_shape = in_sizes.transpose(axis); + std::vector transposed_input_shape_ = + 
framework::vectorize(transposed_input_shape); + out_array[i].Resize(transposed_input_shape); + out_array[i].mutable_data(dev_ctx.GetPlace()); + trans(dev_ctx, temp_array[i], &out_array[i], axis); + } + } + LOG(ERROR) << "GPUBatchDecodeJpegKernel Compute finish"; } }; diff --git a/paddle/fluid/operators/data/batch_decode_random_crop_op.h b/paddle/fluid/operators/data/batch_decode_random_crop_op.h index fd23be38341dc9..f599c74a7dfb23 100644 --- a/paddle/fluid/operators/data/batch_decode_random_crop_op.h +++ b/paddle/fluid/operators/data/batch_decode_random_crop_op.h @@ -24,11 +24,46 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/operators/data/nvjpeg_decoder.h" - namespace paddle { namespace operators { namespace data { +// template +// void TransCompute(const int dim, const DeviceContext& dev_ctx, +// const framework::Tensor& in, framework::Tensor* out, +// const std::vector& axis) { +// switch (dim) { +// case 1: +// math::Transpose trans1; +// trans1(dev_ctx, in, out, axis); +// break; +// case 2: +// math::Transpose trans2; +// trans2(dev_ctx, in, out, axis); +// break; +// case 3: +// math::Transpose trans3; +// trans3(dev_ctx, in, out, axis); +// break; +// case 4: +// math::Transpose trans4; +// trans4(dev_ctx, in, out, axis); +// break; +// case 5: +// math::Transpose trans5; +// trans5(dev_ctx, in, out, axis); +// break; +// case 6: +// math::Transpose trans6; +// trans6(dev_ctx, in, out, axis); +// break; +// default: +// // for dim >= 7 situation +// math::TransposeNormal trans_normal; +// trans_normal(dev_ctx, in, out, axis); +// } +// } + template class CPUBatchDecodeRandomCropKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/data/batch_resize_op.cu b/paddle/fluid/operators/data/batch_resize_op.cu index e2c0319fdcf051..f7a7f52a3703e3 100644 --- a/paddle/fluid/operators/data/batch_resize_op.cu +++ b/paddle/fluid/operators/data/batch_resize_op.cu @@ -234,15 +234,32 @@ class BatchResizeCUDAKernel : 
public framework::OpKernel { bool align_corners = ctx.Attr("align_corners"); int align_mode = ctx.Attr("align_mode"); + // int img_h, img_w;//, idx_h, idx_w, crop_h, crop_w; + auto* img = &x->at(0); int64_t img_c = data_layout == DataLayout::kNCHW ? \ img->dims()[0] : img->dims()[2]; + LOG(ERROR) << "img channel: " << img_c << " || " << data_layout_str; + // img_h = + // data_layout == DataLayout::kNCHW ? img->dims()[1] : img->dims()[0]; + // img_w = + // data_layout == DataLayout::kNCHW ? img->dims()[2] : img->dims()[1]; + std::vector out_dim = {static_cast(x->size()), + size[0], size[1], img_c}; + if (data_layout == DataLayout::kNCHW) { + out_dim = {static_cast(x->size()), img_c, size[0], size[1]}; + } out->Resize(framework::make_ddim(out_dim)); out->mutable_data(ctx.GetPlace()); + // for (int i = 0; i < x->size(); i++) { + // img = &x->at(i); + // auto out_tensor = out->Slice(i, i + 1); + // TensorCopySync(*img, ctx.GetPlace(), &out_tensor); + // } int img_h, img_w, idx_h, idx_w, crop_h, crop_w; for (int i = 0; i < x->size(); i++) { img = &x->at(i); diff --git a/paddle/fluid/operators/data/file_label_loader_op.h b/paddle/fluid/operators/data/file_label_loader_op.h index bbc34a7b546bf1..ffb08a6439b177 100644 --- a/paddle/fluid/operators/data/file_label_loader_op.h +++ b/paddle/fluid/operators/data/file_label_loader_op.h @@ -147,6 +147,7 @@ static void ParseFilesAndLabels(const std::string data_root, } closedir(dir); } + } std::map>> root_to_samples_; @@ -156,6 +157,8 @@ static std::vector>* GetFilesAndLabelsFromCache(cons if (iter == root_to_samples_.end()) { std::vector> samples; ParseFilesAndLabels(data_root, &samples); + std::cout << "files 0: " << samples[0].first << std::endl; + std::cout << "files 1: " << samples[1].first << std::endl; LOG(ERROR) << "Init samples: " << samples.size(); root_to_samples_[data_root] = samples; } diff --git a/paddle/fluid/operators/data/nvjpeg_decoder.cc b/paddle/fluid/operators/data/nvjpeg_decoder.cc index 
784b69b28b05c6..56159f81e51f6c 100644 --- a/paddle/fluid/operators/data/nvjpeg_decoder.cc +++ b/paddle/fluid/operators/data/nvjpeg_decoder.cc @@ -85,7 +85,9 @@ void NvjpegDecoder::CPUDecodeRandomCropResize(const uint8_t* data, size_t length unsigned char* workspace, size_t workspace_size, framework::LoDTensor& temp, framework::LoDTensor* out, platform::Place place) { cv::Mat image = + // cv::imdecode(const_cast(data), cv::IMREAD_COLOR); cv::imdecode(cv::Mat(1, length, CV_8UC1, const_cast(data)), cv::IMREAD_COLOR); + cv::Mat cropped; int height; int width; @@ -93,24 +95,28 @@ void NvjpegDecoder::CPUDecodeRandomCropResize(const uint8_t* data, size_t length ROI roi; roi_generator->GenerateRandomROI(image.cols, image.rows, &roi); cv::Rect cv_roi; + cv_roi.x = roi.x; cv_roi.y = roi.y; cv_roi.width = roi.w; cv_roi.height = roi.h; height = roi.h; width = roi.w; - std::vector out_shape = {3, height, width}; + // std::vector out_shape = {3, height, width}; + std::vector out_shape = {height, width, 3}; temp.Resize(framework::make_ddim(out_shape)); platform::CPUPlace cpu; // allocate memory and assign to out_image auto* data = temp.mutable_data(cpu); - cropped.data = data; + // todo jianglielin: why not work? 
+ // cropped.data = data; image(cv_roi).copyTo(cropped); - out->Resize(framework::make_ddim(out_shape)); - + + std::memcpy(data, cropped.data, 3 * height * width); + TensorCopySync(temp, place, out); - } else { + LOG(ERROR) << "Not Use Opencv decode!!!"; // throw error } } @@ -139,6 +145,9 @@ int NvjpegDecoder::ParseDecodeParams( return 1; #endif } + else{ + // LOG(ERROR) << "Use nvjpeg decode!!!"; + } int64_t width = static_cast(widths[0]); int64_t height = static_cast(heights[0]); @@ -174,13 +183,16 @@ int NvjpegDecoder::ParseDecodeParams( if (roi_generator) { ROI roi; roi_generator->GenerateRandomROI(width, height, &roi); - + // roi.x = 0; + // roi.y = 0; + // roi.w = 500; + // roi.h = 400; PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecodeParamsSetROI(decode_params_, roi.x, roi.y, roi.w, roi.h)); height = roi.h; width = roi.w; } - std::vector out_shape = {output_components, height, width}; + std::vector out_shape = {height, width, output_components}; out->Resize(framework::make_ddim(out_shape)); // allocate memory and assign to out_image @@ -217,7 +229,9 @@ void NvjpegDecoder::Run( if (res) { return; } + // LOG(ERROR) << "ParseDecodeParams finish !!!"; Decode(bit_stream, bit_len, &image); + // LOG(ERROR) << "Decode finish !!!"; } NvjpegDecoderThreadPool::NvjpegDecoderThreadPool(const int num_threads, const std::string mode, const int dev_id) diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu index cfdfa456e39eac..0ee26752aebc34 100644 --- a/paddle/fluid/operators/math/math_function.cu +++ b/paddle/fluid/operators/math/math_function.cu @@ -50,6 +50,7 @@ template struct SetConstant; \ template struct Transpose; \ template struct Transpose; \ + template struct Transpose; \ template struct Transpose; \ template struct Transpose; \ template struct Transpose Date: Mon, 14 Feb 2022 06:37:21 +0000 Subject: [PATCH 61/95] clean code --- .../data/batch_decode_random_crop_op.h | 36 ------------------- 
.../fluid/operators/data/batch_resize_op.cu | 13 ------- .../operators/data/file_label_loader_op.h | 4 +-- paddle/fluid/operators/data/nvjpeg_decoder.cc | 9 ++--- 4 files changed, 4 insertions(+), 58 deletions(-) diff --git a/paddle/fluid/operators/data/batch_decode_random_crop_op.h b/paddle/fluid/operators/data/batch_decode_random_crop_op.h index f599c74a7dfb23..de96e38ca95ef9 100644 --- a/paddle/fluid/operators/data/batch_decode_random_crop_op.h +++ b/paddle/fluid/operators/data/batch_decode_random_crop_op.h @@ -28,42 +28,6 @@ namespace paddle { namespace operators { namespace data { -// template -// void TransCompute(const int dim, const DeviceContext& dev_ctx, -// const framework::Tensor& in, framework::Tensor* out, -// const std::vector& axis) { -// switch (dim) { -// case 1: -// math::Transpose trans1; -// trans1(dev_ctx, in, out, axis); -// break; -// case 2: -// math::Transpose trans2; -// trans2(dev_ctx, in, out, axis); -// break; -// case 3: -// math::Transpose trans3; -// trans3(dev_ctx, in, out, axis); -// break; -// case 4: -// math::Transpose trans4; -// trans4(dev_ctx, in, out, axis); -// break; -// case 5: -// math::Transpose trans5; -// trans5(dev_ctx, in, out, axis); -// break; -// case 6: -// math::Transpose trans6; -// trans6(dev_ctx, in, out, axis); -// break; -// default: -// // for dim >= 7 situation -// math::TransposeNormal trans_normal; -// trans_normal(dev_ctx, in, out, axis); -// } -// } - template class CPUBatchDecodeRandomCropKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/data/batch_resize_op.cu b/paddle/fluid/operators/data/batch_resize_op.cu index f7a7f52a3703e3..7728a6b4631631 100644 --- a/paddle/fluid/operators/data/batch_resize_op.cu +++ b/paddle/fluid/operators/data/batch_resize_op.cu @@ -234,18 +234,10 @@ class BatchResizeCUDAKernel : public framework::OpKernel { bool align_corners = ctx.Attr("align_corners"); int align_mode = ctx.Attr("align_mode"); - // int img_h, img_w;//, idx_h, idx_w, 
crop_h, crop_w; - auto* img = &x->at(0); int64_t img_c = data_layout == DataLayout::kNCHW ? \ img->dims()[0] : img->dims()[2]; - LOG(ERROR) << "img channel: " << img_c << " || " << data_layout_str; - // img_h = - // data_layout == DataLayout::kNCHW ? img->dims()[1] : img->dims()[0]; - // img_w = - // data_layout == DataLayout::kNCHW ? img->dims()[2] : img->dims()[1]; - std::vector out_dim = {static_cast(x->size()), size[0], size[1], img_c}; if (data_layout == DataLayout::kNCHW) { @@ -255,11 +247,6 @@ class BatchResizeCUDAKernel : public framework::OpKernel { out->Resize(framework::make_ddim(out_dim)); out->mutable_data(ctx.GetPlace()); - // for (int i = 0; i < x->size(); i++) { - // img = &x->at(i); - // auto out_tensor = out->Slice(i, i + 1); - // TensorCopySync(*img, ctx.GetPlace(), &out_tensor); - // } int img_h, img_w, idx_h, idx_w, crop_h, crop_w; for (int i = 0; i < x->size(); i++) { img = &x->at(i); diff --git a/paddle/fluid/operators/data/file_label_loader_op.h b/paddle/fluid/operators/data/file_label_loader_op.h index ffb08a6439b177..eef07790372a24 100644 --- a/paddle/fluid/operators/data/file_label_loader_op.h +++ b/paddle/fluid/operators/data/file_label_loader_op.h @@ -157,8 +157,8 @@ static std::vector>* GetFilesAndLabelsFromCache(cons if (iter == root_to_samples_.end()) { std::vector> samples; ParseFilesAndLabels(data_root, &samples); - std::cout << "files 0: " << samples[0].first << std::endl; - std::cout << "files 1: " << samples[1].first << std::endl; + // std::cout << "files 0: " << samples[0].first << std::endl; + // std::cout << "files 1: " << samples[1].first << std::endl; LOG(ERROR) << "Init samples: " << samples.size(); root_to_samples_[data_root] = samples; } diff --git a/paddle/fluid/operators/data/nvjpeg_decoder.cc b/paddle/fluid/operators/data/nvjpeg_decoder.cc index 56159f81e51f6c..188723ba306fea 100644 --- a/paddle/fluid/operators/data/nvjpeg_decoder.cc +++ b/paddle/fluid/operators/data/nvjpeg_decoder.cc @@ -85,7 +85,6 @@ void 
NvjpegDecoder::CPUDecodeRandomCropResize(const uint8_t* data, size_t length unsigned char* workspace, size_t workspace_size, framework::LoDTensor& temp, framework::LoDTensor* out, platform::Place place) { cv::Mat image = - // cv::imdecode(const_cast(data), cv::IMREAD_COLOR); cv::imdecode(cv::Mat(1, length, CV_8UC1, const_cast(data)), cv::IMREAD_COLOR); cv::Mat cropped; @@ -95,14 +94,13 @@ void NvjpegDecoder::CPUDecodeRandomCropResize(const uint8_t* data, size_t length ROI roi; roi_generator->GenerateRandomROI(image.cols, image.rows, &roi); cv::Rect cv_roi; - cv_roi.x = roi.x; cv_roi.y = roi.y; cv_roi.width = roi.w; cv_roi.height = roi.h; height = roi.h; width = roi.w; - // std::vector out_shape = {3, height, width}; + std::vector out_shape = {height, width, 3}; temp.Resize(framework::make_ddim(out_shape)); platform::CPUPlace cpu; @@ -183,10 +181,7 @@ int NvjpegDecoder::ParseDecodeParams( if (roi_generator) { ROI roi; roi_generator->GenerateRandomROI(width, height, &roi); - // roi.x = 0; - // roi.y = 0; - // roi.w = 500; - // roi.h = 400; + PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecodeParamsSetROI(decode_params_, roi.x, roi.y, roi.w, roi.h)); height = roi.h; width = roi.w; From baf2f551d9b1b3791c002a1ce5b41299662f9fdb Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Mon, 14 Feb 2022 07:15:38 +0000 Subject: [PATCH 62/95] add mirror_normalize --- paddle/fluid/operators/data/CMakeLists.txt | 1 + .../operators/data/mirror_normalize_op.cc | 104 ++++++++++++++++++ .../operators/data/mirror_normalize_op.cu | 95 ++++++++++++++++ .../operators/data/mirror_normalize_op.h | 38 +++++++ paddle/fluid/operators/data/random_flip_op.h | 2 +- .../operators/data/unity_build_rule.cmake | 6 +- python/paddle/vision/ops.py | 56 ++++++---- 7 files changed, 276 insertions(+), 26 deletions(-) create mode 100644 paddle/fluid/operators/data/mirror_normalize_op.cc create mode 100644 paddle/fluid/operators/data/mirror_normalize_op.cu create mode 100644 
paddle/fluid/operators/data/mirror_normalize_op.h diff --git a/paddle/fluid/operators/data/CMakeLists.txt b/paddle/fluid/operators/data/CMakeLists.txt index 915bda52a4de69..10db116fcf7dd7 100644 --- a/paddle/fluid/operators/data/CMakeLists.txt +++ b/paddle/fluid/operators/data/CMakeLists.txt @@ -27,6 +27,7 @@ op_library(batch_resize_op SRCS batch_resize_op.cc batch_resize_op.cu DEPS ${OP_ op_library(file_label_loader_op SRCS file_label_loader_op.cc DEPS ${OP_HEADER_DEPS}) op_library(random_flip_op SRCS random_flip_op.cc DEPS ${OP_HEADER_DEPS}) +op_library(mirror_normalize_op SRCS mirror_normalize_op.cc mirror_normalize_op.cu DEPS ${OP_HEADER_DEPS}) # register_operators() diff --git a/paddle/fluid/operators/data/mirror_normalize_op.cc b/paddle/fluid/operators/data/mirror_normalize_op.cc new file mode 100644 index 00000000000000..31f9c3b879dd79 --- /dev/null +++ b/paddle/fluid/operators/data/mirror_normalize_op.cc @@ -0,0 +1,104 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/data/mirror_normalize_op.h" + +namespace paddle { +namespace operators { +namespace data { + +class MirrorNormalizeOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE_EQ( + ctx->HasInput("X"), true, + platform::errors::NotFound("Input(X) of MirrorNormalizeOp should not be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("Mirror"), true, + platform::errors::NotFound("Input(Mirror) of MirrorNormalizeOp should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, + platform::errors::NotFound( + "Output(Out) of MirrorNormalizeOp should not be null.")); + + auto x_dims = ctx->GetInputDim("X"); + if (ctx->IsRuntime()) { + PADDLE_ENFORCE_EQ(x_dims.size(), 4, + platform::errors::NotFound( + "Input(X) of MirrorNormalizeOp should be a 4-D Tensor")); + + auto c = x_dims[1]; + auto mean = ctx->Attrs().Get>("mean"); + auto std = ctx->Attrs().Get>("std"); + PADDLE_ENFORCE_EQ(mean.size(), c, + platform::errors::NotFound( + "The channel number of Input(X) should equal to length of mean")); + PADDLE_ENFORCE_EQ(mean.size(), c, + platform::errors::NotFound( + "The channel number of Input(X) should equal to length of mean")); + } + + std::vector output_dims(x_dims.size()); + for (int i = 0; i < x_dims.size(); ++i) { + output_dims[i] = x_dims[i]; + } + ctx->SetOutputDim("Out", framework::make_ddim(output_dims)); + ctx->ShareLoD("X", "Out"); + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } +}; + +class MirrorNormalizeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of mirror_normalize op."); + 
AddInput("Mirror", "(Tensor), The mirror vector for random flip, the " + "shape is {N, 1}, N is the batch size of input X"); + AddOutput("Out", "(Tensor), The output tensor in the same shape as " + "input X."); + AddAttr>("mean", "The mean value to normalize data"); + AddAttr>("std", "The stdvalue to normalize data"); + AddComment(R"DOC( + Mirror Normalize Operator. + )DOC"); + } +}; + +class MirrorNormalizeOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput { + protected: + std::unordered_map& GetInputOutputWithSameType() + const override { + static std::unordered_map m{{"X", /*->*/ "Out"}}; + return m; + } +}; + +} // namespace data +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators::data; +namespace plat = paddle::platform; +REGISTER_OPERATOR(mirror_normalize, ops::MirrorNormalizeOp, ops::MirrorNormalizeOpMaker, ops::MirrorNormalizeOpInferVarType); + +REGISTER_OP_CPU_KERNEL( + mirror_normalize, ops::MirrorNormalizeCPUKernel, + ops::MirrorNormalizeCPUKernel, + ops::MirrorNormalizeCPUKernel); diff --git a/paddle/fluid/operators/data/mirror_normalize_op.cu b/paddle/fluid/operators/data/mirror_normalize_op.cu new file mode 100644 index 00000000000000..f4371ee16ca805 --- /dev/null +++ b/paddle/fluid/operators/data/mirror_normalize_op.cu @@ -0,0 +1,95 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/operators/data/mirror_normalize_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/gpu_launch_config.h" + +namespace paddle { +namespace operators { +namespace data { + +using framework::LoDTensor; + +template +__global__ void KeMirrorNormalize( + const int numel, const T* in_data, const bool* mirrors, T* out_data, + const float* mean, const float* std, const int chw, const int hw, + const int w) { + CUDA_KERNEL_LOOP(idx, numel) { + int ni = idx / chw; + int ci = (idx % chw) / hw; + int wi = idx % w; + + int out_idx = idx - 2 * wi + w - 1; + out_data[out_idx] = (in_data[idx] - mean[ci]) / std[ci]; + } +} + +template +class MirrorNormalizeCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + LOG(ERROR) << "MirrorNormalizeCUDAKernel Compute start"; + auto* x = ctx.Input("X"); + auto* mirror = ctx.Input("Mirror"); + auto* out = ctx.Output("Out"); + + auto mean = ctx.Attr>("mean"); + auto std = ctx.Attr>("std"); + + auto numel = x->numel(); + auto n = x->dims()[0]; + auto c = x->dims()[1]; + auto h = x->dims()[2]; + auto w = x->dims()[3]; + auto hw = h * w; + auto chw = c * hw; + + const T* x_data = x->data(); + const bool* mirror_data = mirror->data(); + T* out_data = out->mutable_data(ctx.GetPlace()); + + auto& dev_ctx = ctx.cuda_device_context(); + const auto gplace = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + const auto cplace = platform::CPUPlace(); + int bytes = sizeof(float) * mean.size(); + + auto mean_ptr = memory::Alloc(dev_ctx, bytes); + float* mean_data = reinterpret_cast(mean_ptr->ptr()); + memory::Copy(gplace, mean_data, cplace, mean.data(), bytes, + dev_ctx.stream()); + auto std_ptr = memory::Alloc(dev_ctx, bytes); + float* std_data = reinterpret_cast(std_ptr->ptr()); + memory::Copy(gplace, std_data, cplace, std.data(), bytes, + dev_ctx.stream()); + + platform::GpuLaunchConfig config = + 
platform::GetGpuLaunchConfig1D(dev_ctx, numel); + KeMirrorNormalize<<>>( + numel, x_data, mirror_data, out_data, mean_data, std_data, chw, hw, w); + LOG(ERROR) << "MirrorNormalizeCUDAKernel Compute finish"; + } +}; + +} // namespace data +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(mirror_normalize, + ops::data::MirrorNormalizeCUDAKernel, + ops::data::MirrorNormalizeCUDAKernel, + ops::data::MirrorNormalizeCUDAKernel); diff --git a/paddle/fluid/operators/data/mirror_normalize_op.h b/paddle/fluid/operators/data/mirror_normalize_op.h new file mode 100644 index 00000000000000..fce477c527dc84 --- /dev/null +++ b/paddle/fluid/operators/data/mirror_normalize_op.h @@ -0,0 +1,38 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { +namespace data { + +template +class MirrorNormalizeCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + // no cpu kernel. 
+ PADDLE_THROW(platform::errors::Unimplemented( + "BatchResize op only supports GPU now.")); + } +}; + +} // namespace data +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/data/random_flip_op.h b/paddle/fluid/operators/data/random_flip_op.h index e8f31e1fe69c28..f1564fc2ed4521 100644 --- a/paddle/fluid/operators/data/random_flip_op.h +++ b/paddle/fluid/operators/data/random_flip_op.h @@ -75,7 +75,7 @@ class RandomFlipCPUKernel : public framework::OpKernel { auto* data = out->mutable_data(ctx.GetPlace()); auto* generator = CreateRandomFlipGenerator(seed, prob); for (int64_t i = 0; i < x->dims()[0]; i++) { - data[i] = generator->Generate() ? 1 : 0; + data[i] = generator->Generate(); } LOG(ERROR) << "RandomFlipCPUKernel finish"; } diff --git a/paddle/fluid/operators/data/unity_build_rule.cmake b/paddle/fluid/operators/data/unity_build_rule.cmake index 33fa45153fa4fe..e9bde0c1f0ccc7 100644 --- a/paddle/fluid/operators/data/unity_build_rule.cmake +++ b/paddle/fluid/operators/data/unity_build_rule.cmake @@ -12,9 +12,11 @@ register_unity_group(cc dataloader_op.cc map_op.cc batch_decode_random_crop_op.cc - random_flip_op.cc) + random_flip_op.cc + mirror_normalize_op.cc) register_unity_group(cu dataloader_op.cu.cc map_op.cu.cc - batch_decode_random_crop_op.cu) + batch_decode_random_crop_op.cu, + mirror_normalize_op.cu) diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index f9084b629c7e54..6af427aa82a2db 100644 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -1012,10 +1012,13 @@ def image_decode_random_crop(x, return out -def flip_vector(x, prob=0.5, name=None): - helper = LayerHelper("flip_vector", **locals()) +def random_flip(x, prob=0.5, name=None): + if prob < 0. 
or prob > 1.: + raise ValueError("prob should in (0, 1) in random_flip") + + helper = LayerHelper("random_flip", **locals()) out = helper.create_variable( - name=unique_name.generate("flip_vector"), + name=unique_name.generate("random_flip"), type=core.VarDesc.VarType.LOD_TENSOR, dtype=core.VarDesc.VarType.BOOL) helper.append_op( @@ -1026,27 +1029,34 @@ def flip_vector(x, prob=0.5, name=None): return out -def random_flip(x, prob=0.5, name=None): - if prob < 0. or prob > 1.: - raise ValueError("prob should in (0, 1) in random_flip") +def mirror_normalize(x, mirror, + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.120, 57.375], + name=None): + def _to_list_3(l): + if isinstance(l, (list, tuple)): + assert len(l) == 1 or len(l) == 3, \ + "input list length should be 1 or 3" + if len(l) == 1: + l = l * 3 + return l + else: + return [l] * 3 - if in_dygraph_mode(): - p = np.random.uniform(0., 1., x.shape[0:1]) - for i in range(x.shape[0]): - if p[i] < prob: - x[i] = paddle.flip(x[i], -1) - return x - - # p = paddle.uniform([layers.shape(x)[0], 1], min=0., max=1.) 
- # prob = paddle.ones([layers.shape(x)[0], 1]) * prob - # cond = layers.less_than(p, prob) - cond = flip_vector(x, prob) - ie = layers.IfElse(cond) - with ie.true_block(): - out = ie.input(x) - out = paddle.flip(x, -1) - ie.output(out) - return ie()[0] + mean = _to_list_3(mean) + std = _to_list_3(std) + + helper = LayerHelper("mirror_normalize", **locals()) + out = helper.create_variable( + name=unique_name.generate("mirror_normalize"), + type=core.VarDesc.VarType.LOD_TENSOR, + dtype=core.VarDesc.VarType.BOOL) + helper.append_op( + type="mirror_normalize", + inputs={"X": x, "Mirror": mirror}, + outputs={"Out": out}, + attrs={"mean": mean, "std": std}) + return out def decode_jpeg(x, mode='unchanged', name=None): From da718cd99485a52c864c5330984c1774235fa3b0 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Mon, 14 Feb 2022 07:35:15 +0000 Subject: [PATCH 63/95] revert pipeline debug --- python/paddle/fluid/dataloader/pipeline.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/python/paddle/fluid/dataloader/pipeline.py b/python/paddle/fluid/dataloader/pipeline.py index 2206f39b8abff8..f2da5b12102f7f 100755 --- a/python/paddle/fluid/dataloader/pipeline.py +++ b/python/paddle/fluid/dataloader/pipeline.py @@ -127,16 +127,16 @@ def __next__(self): "Pipeline not built, please call build() firstly" self._output_vars = self._prepare_output_vars() - # try: - import sys - import time - tic = time.time() - _C_ops.dataloader(self._output_vars, *self._attrs) - toc = time.time() - print("_C_ops calling cost {}ms".format((toc - tic) * 1000.)) - sys.stdout.flush() - # except: - # raise StopIteration + try: + import sys + import time + tic = time.time() + _C_ops.dataloader(self._output_vars, *self._attrs) + toc = time.time() + print("_C_ops calling cost {}ms".format((toc - tic) * 1000.)) + sys.stdout.flush() + except: + raise StopIteration return {k: v for k, v in zip(self._out_names, self._output_vars)} From 
a764fca21c72aa304662c3f4204ffa32569fade8 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Mon, 14 Feb 2022 13:43:08 +0000 Subject: [PATCH 64/95] fix flip_normalize output error --- paddle/fluid/operators/data/mirror_normalize_op.cc | 3 +-- paddle/fluid/operators/data/mirror_normalize_op.cu | 4 ++-- python/paddle/fluid/dataloader/ops.py | 5 ++++- python/paddle/vision/ops.py | 7 +++---- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/operators/data/mirror_normalize_op.cc b/paddle/fluid/operators/data/mirror_normalize_op.cc index 31f9c3b879dd79..2efab2e3f1ab1a 100644 --- a/paddle/fluid/operators/data/mirror_normalize_op.cc +++ b/paddle/fluid/operators/data/mirror_normalize_op.cc @@ -100,5 +100,4 @@ REGISTER_OPERATOR(mirror_normalize, ops::MirrorNormalizeOp, ops::MirrorNormalize REGISTER_OP_CPU_KERNEL( mirror_normalize, ops::MirrorNormalizeCPUKernel, - ops::MirrorNormalizeCPUKernel, - ops::MirrorNormalizeCPUKernel); + ops::MirrorNormalizeCPUKernel); diff --git a/paddle/fluid/operators/data/mirror_normalize_op.cu b/paddle/fluid/operators/data/mirror_normalize_op.cu index f4371ee16ca805..9c62e41b5af462 100644 --- a/paddle/fluid/operators/data/mirror_normalize_op.cu +++ b/paddle/fluid/operators/data/mirror_normalize_op.cu @@ -32,7 +32,8 @@ __global__ void KeMirrorNormalize( int ci = (idx % chw) / hw; int wi = idx % w; - int out_idx = idx - 2 * wi + w - 1; + int out_idx = idx; + if (mirrors[ni]) out_idx = idx - 2 * wi + w - 1; out_data[out_idx] = (in_data[idx] - mean[ci]) / std[ci]; } } @@ -90,6 +91,5 @@ class MirrorNormalizeCUDAKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL(mirror_normalize, - ops::data::MirrorNormalizeCUDAKernel, ops::data::MirrorNormalizeCUDAKernel, ops::data::MirrorNormalizeCUDAKernel); diff --git a/python/paddle/fluid/dataloader/ops.py b/python/paddle/fluid/dataloader/ops.py index 4b2a569a8bec1c..0916d4e00d0294 100755 --- a/python/paddle/fluid/dataloader/ops.py +++ 
b/python/paddle/fluid/dataloader/ops.py @@ -76,7 +76,8 @@ def map(map_func, inputs): program_inputs = [ map_block.create_var( name=unique_name.generate("map_sub"), - type=inputs[0].desc.type(), + type=inp.desc.type(), + dtype=inp.desc.dtype(), persistable=False) for inp in inputs] program_outputs = map_func(*program_inputs) program_outputs = _to_list(program_outputs) @@ -88,6 +89,7 @@ def map(map_func, inputs): [helper.create_variable( name=unique_name.generate("map"), type=outp.desc.type(), + dtype=outp.desc.dtype(), persistable=True) for outp in program_outputs] attrs = { "map_block": map_block, @@ -138,6 +140,7 @@ def data_reader(reader_func, [helper.create_variable( name=unique_name.generate("data_reader"), type=outp.desc.type(), + dtype=outp.desc.dtype(), persistable=True) for outp in program_outputs] attrs = { diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index ee5b5f70af346b..738fb104ee2c39 100644 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -1045,14 +1045,13 @@ def _to_list_3(l): else: return [l] * 3 + x = paddle.cast(x, dtype='float32') mean = _to_list_3(mean) std = _to_list_3(std) helper = LayerHelper("mirror_normalize", **locals()) - out = helper.create_variable( - name=unique_name.generate("mirror_normalize"), - type=core.VarDesc.VarType.LOD_TENSOR, - dtype=core.VarDesc.VarType.BOOL) + dtype = helper.input_dtype() + out = helper.create_variable_for_type_inference(dtype) helper.append_op( type="mirror_normalize", inputs={"X": x, "Mirror": mirror}, From a1300869b1851c45dac938f88846cb05b8dd45a1 Mon Sep 17 00:00:00 2001 From: LielinJiang Date: Thu, 17 Feb 2022 12:07:24 +0000 Subject: [PATCH 65/95] fix index --- paddle/fluid/operators/data/data_reader_op.h | 45 ++++++++++++++------ 1 file changed, 33 insertions(+), 12 deletions(-) diff --git a/paddle/fluid/operators/data/data_reader_op.h b/paddle/fluid/operators/data/data_reader_op.h index daa41182722328..1f2ffe894d9237 100644 --- 
a/paddle/fluid/operators/data/data_reader_op.h +++ b/paddle/fluid/operators/data/data_reader_op.h @@ -45,15 +45,27 @@ class Sampler { const int rank, const int world_size) : current_iter_(0), batch_size_(batch_size), - num_samples_(num_samples), + // num_samples_(num_samples), drop_last_(drop_last), rank_(rank), world_size_(world_size) { LOG(ERROR) << "Sampler num_samples " << num_samples; - sample_ids_.reserve(num_samples); - for (int64_t i = 0; i < num_samples; i++) { + int trunc_num_samples; + if (drop_last) { + int total_batch_size = world_size * batch_size; + trunc_num_samples = floor(num_samples / total_batch_size) * total_batch_size; + sample_ids_.reserve(trunc_num_samples); + LOG(ERROR) << " Trunc sampler num_samples " << trunc_num_samples; + } + else{ + sample_ids_.reserve(num_samples); + trunc_num_samples = num_samples; + } + for (int64_t i = 0; i < trunc_num_samples; i++) { sample_ids_.emplace_back(i); } + num_samples_ = sample_ids_.size(); + LOG(ERROR) << " Final num_samples " << num_samples_; if (shuffle) { rnd_.seed(time(0)); std::shuffle(sample_ids_.begin(), sample_ids_.end(), rnd_); @@ -62,23 +74,32 @@ class Sampler { void GetNextIndices(std::vector* indices) { int64_t start_idx = - batch_size_ * world_size_ * current_iter_ + rank_ * batch_size_; + batch_size_ * world_size_ * current_iter_ + rank_; + // batch_size_ * world_size_ * current_iter_ + rank_ * batch_size_; current_iter_++; - if (start_idx >= num_samples_) return; - if (drop_last_ && start_idx + batch_size_ >= num_samples_) return; - - int64_t batch_len = std::min(batch_size_, num_samples_ - start_idx); - indices->reserve(batch_len); - for (int64_t i = 0; i < batch_len; i++) { - indices->emplace_back(sample_ids_[start_idx + i]); + if (start_idx >= num_samples_) { + LOG(ERROR) << " start idx >= num samples " << start_idx << " >= " << num_samples_; + return; + } + // if (drop_last_ && start_idx + batch_size_ >= num_samples_) return; + + // int64_t batch_len = std::min(batch_size_, 
num_samples_ - start_idx); + // indices->reserve(batch_len); + for (int64_t i = 0; i < batch_size_; i++) { + int cur_idx = start_idx + i * world_size_; + if (cur_idx >= num_samples_) { + LOG(ERROR) << " cur_idx >= num samples " << cur_idx << " >= " << num_samples_; + return; + } + indices->emplace_back(sample_ids_[cur_idx]); } } private: int64_t current_iter_; const int64_t batch_size_; - const int64_t num_samples_; + int64_t num_samples_; const bool drop_last_; const int rank_; const int world_size_; From f867fcac23276f6f84a7944c5466a95a1cc1d2fa Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Mon, 28 Feb 2022 15:36:59 +0000 Subject: [PATCH 66/95] fix memory leak --- paddle/fluid/operators/data/map_runner.cc | 1 + paddle/fluid/operators/data/pipeline.cc | 2 ++ python/paddle/vision/ops.py | 23 +++++++++++++---------- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/operators/data/map_runner.cc b/paddle/fluid/operators/data/map_runner.cc index e6bc75802d9de8..e1b39f44adef67 100644 --- a/paddle/fluid/operators/data/map_runner.cc +++ b/paddle/fluid/operators/data/map_runner.cc @@ -92,6 +92,7 @@ bool MapRunner::ShareInputsIntoScope(Scope* scope) { // share input tensor to dst variable auto& dst_tensor_arr = *(var->GetMutable()); + for (auto &tensor: dst_tensor_arr) tensor.clear(); dst_tensor_arr.clear(); dst_tensor_arr.reserve(tensor_arr.size()); for (size_t i = 0; i < tensor_arr.size(); i++) { diff --git a/paddle/fluid/operators/data/pipeline.cc b/paddle/fluid/operators/data/pipeline.cc index 8d4dc269fb6996..7854c725bd9dee 100644 --- a/paddle/fluid/operators/data/pipeline.cc +++ b/paddle/fluid/operators/data/pipeline.cc @@ -103,6 +103,8 @@ void Pipeline::ReadNext(std::vector &out_vars) { // CheckOutputVarStatus(*(out_vars[i]), output_var_names_[i]); copy_tensor(outputs.at(0), out_vars[i]->GetMutable()); + for (auto &output: outputs) output.clear(); + outputs.clear(); } } diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py 
index 738fb104ee2c39..4b19f818f826ed 100644 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -1018,16 +1018,19 @@ def random_flip(x, prob=0.5, name=None): if prob < 0. or prob > 1.: raise ValueError("prob should in (0, 1) in random_flip") - helper = LayerHelper("random_flip", **locals()) - out = helper.create_variable( - name=unique_name.generate("random_flip"), - type=core.VarDesc.VarType.LOD_TENSOR, - dtype=core.VarDesc.VarType.BOOL) - helper.append_op( - type="random_flip", - inputs={"X": x}, - outputs={"Out": out}, - attrs={"probability": prob}) + rand_vec = layers.uniform_random_batch_size_like( + x, [1, 1], min=0., max=1.) + return rand_vec < prob + # helper = LayerHelper("random_flip", **locals()) + # out = helper.create_variable( + # name=unique_name.generate("random_flip"), + # type=core.VarDesc.VarType.LOD_TENSOR, + # dtype=core.VarDesc.VarType.BOOL) + # helper.append_op( + # type="random_flip", + # inputs={"X": x}, + # outputs={"Out": out}, + # attrs={"probability": prob}) return out From 8c59fa80718edc97e21f918944e4792eb9523d09 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 1 Mar 2022 14:11:16 +0000 Subject: [PATCH 67/95] fix memory leak --- .../data/batch_decode_random_crop_op.cu | 23 +++++---- .../operators/data/random_roi_generator.cc | 4 ++ .../operators/data/random_roi_generator.h | 47 +++++++++++++++++++ 3 files changed, 62 insertions(+), 12 deletions(-) diff --git a/paddle/fluid/operators/data/batch_decode_random_crop_op.cu b/paddle/fluid/operators/data/batch_decode_random_crop_op.cu index c15e9d0ae3e471..0f6ddef5d19939 100644 --- a/paddle/fluid/operators/data/batch_decode_random_crop_op.cu +++ b/paddle/fluid/operators/data/batch_decode_random_crop_op.cu @@ -14,10 +14,11 @@ #if !defined(WITH_NV_JETSON) && !defined(PADDLE_WITH_HIP) +#include +#include #include "paddle/fluid/operators/data/batch_decode_random_crop_op.h" #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" #include 
"paddle/fluid/operators/math/math_function.h" -// #include "paddle/fluid/operators/transpose_op.h" namespace paddle { namespace operators { @@ -27,7 +28,6 @@ using LoDTensorBlockingQueueHolder = operators::reader::LoDTensorBlockingQueueHo using DataLayout = framework::DataLayout; NvjpegDecoderThreadPool* decode_pool = nullptr; -// std::seed_seq* rand_seq = nullptr; template class GPUBatchDecodeRandomCropKernel : public framework::OpKernel { @@ -45,13 +45,14 @@ class GPUBatchDecodeRandomCropKernel : public framework::OpKernel { const framework::LoDTensorArray* inputs = ctx.Input("X"); - LOG(ERROR) << "GPUBatchDecodeJpegKernel Compute start, num_threads: " << num_threads << ", batch_size: " << inputs->size() << ", program_id: " << program_id; + int batch_size = inputs->size(); + LOG(ERROR) << "GPUBatchDecodeJpegKernel Compute start, num_threads: " << num_threads << ", batch_size: " << batch_size << ", program_id: " << program_id; auto* out = ctx.OutputVar("Out"); auto dev = platform::CUDAPlace(local_rank); auto& out_array = *out->GetMutable(); - out_array.resize(inputs->size()); + out_array.resize(batch_size); const std::string data_layout_str = ctx.Attr("data_layout"); const DataLayout data_layout = @@ -59,7 +60,7 @@ class GPUBatchDecodeRandomCropKernel : public framework::OpKernel { framework::LoDTensorArray temp_array; if (data_layout == DataLayout::kNCHW) { - temp_array.resize(inputs->size()); + temp_array.resize(batch_size); } auto aspect_ratio_min = ctx.Attr("aspect_ratio_min"); @@ -70,9 +71,9 @@ class GPUBatchDecodeRandomCropKernel : public framework::OpKernel { auto area_max = ctx.Attr("area_max"); AreaRange area_range{area_min, area_max}; - std::seed_seq rand_seq{static_cast(time(0))}; - std::vector rands(inputs->size()); - rand_seq.generate(rands.begin(), rands.end()); + auto* generators = GeneratorManager::Instance()->GetGenerators( + program_id, batch_size, aspect_ratio_range, + area_range); for (size_t i = 0; i < inputs->size(); i++) { const 
framework::LoDTensor x = inputs->at(i); @@ -84,8 +85,7 @@ class GPUBatchDecodeRandomCropKernel : public framework::OpKernel { .bit_stream = x_data, .bit_len = x_numel, .tensor = &temp_array[i], - .roi_generator = new RandomROIGenerator( - aspect_ratio_range, area_range, rands[i]), + .roi_generator = generators->at(i).get(), .place = dev }; decode_pool->AddTask(std::make_shared(task)); @@ -95,8 +95,7 @@ class GPUBatchDecodeRandomCropKernel : public framework::OpKernel { .bit_stream = x_data, .bit_len = x_numel, .tensor = &out_array[i], - .roi_generator = new RandomROIGenerator( - aspect_ratio_range, area_range, rands[i]), + .roi_generator = generators->at(i).get(), .place = dev }; decode_pool->AddTask(std::make_shared(task)); diff --git a/paddle/fluid/operators/data/random_roi_generator.cc b/paddle/fluid/operators/data/random_roi_generator.cc index 9adc457f5745ef..8bdb225f724254 100644 --- a/paddle/fluid/operators/data/random_roi_generator.cc +++ b/paddle/fluid/operators/data/random_roi_generator.cc @@ -99,6 +99,10 @@ void RandomROIGenerator::GenerateRandomROI( } } +// initialization static variables out of GeneratorManager +GeneratorManager* GeneratorManager::gm_instance_ptr_ = nullptr; +std::mutex GeneratorManager::m_; + } // namespace data } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/data/random_roi_generator.h b/paddle/fluid/operators/data/random_roi_generator.h index 32b540a57fc756..80e2675817b0d3 100644 --- a/paddle/fluid/operators/data/random_roi_generator.h +++ b/paddle/fluid/operators/data/random_roi_generator.h @@ -17,6 +17,9 @@ limitations under the License. 
*/ #include #include #include +#include +#include +#include namespace paddle { namespace operators { @@ -55,6 +58,50 @@ class RandomROIGenerator { int64_t num_attempts_; }; +class GeneratorManager { + using Generators = std::vector>; + + private: + // DISABLE_COPY_AND_ASSIGN(GeneratorManager); + + static GeneratorManager* gm_instance_ptr_; + static std::mutex m_; + + std::map> prog_id_to_generators_; + + public: + static GeneratorManager *Instance() { + if (gm_instance_ptr_ == nullptr) { + std::lock_guard lk(m_); + if (gm_instance_ptr_ == nullptr) { + gm_instance_ptr_ = new GeneratorManager; + } + } + return gm_instance_ptr_; + } + + Generators* GetGenerators(const int64_t program_id, const int batch_size, + const AspectRatioRange aspect_ratio_range, + const AreaRange area_range) { + auto iter = prog_id_to_generators_.find(program_id); + if (iter == prog_id_to_generators_.end()) { + prog_id_to_generators_[program_id] = std::unique_ptr( + new Generators(batch_size)); + + std::seed_seq rand_seq{static_cast(time(0))}; + std::vector rands(batch_size); + rand_seq.generate(rands.begin(), rands.end()); + + for (int i = 0; i < batch_size; i++) { + prog_id_to_generators_[program_id]->at(i).reset( + new RandomROIGenerator(aspect_ratio_range, + area_range, rands[i])); + } + } + return prog_id_to_generators_[program_id].get(); + } +}; + } // namespace data } // namespace operators } // namespace paddle From b5213fe4ac39d7753bf0d7a7910d0c32d060bb78 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Thu, 3 Mar 2022 12:36:59 +0000 Subject: [PATCH 68/95] add barrier --- python/paddle/fluid/dataloader/pipeline.py | 7 ++++++ python/paddle/vision/ops.py | 26 +++++++++++----------- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/python/paddle/fluid/dataloader/pipeline.py b/python/paddle/fluid/dataloader/pipeline.py index f2da5b12102f7f..309f2afdc9548a 100755 --- a/python/paddle/fluid/dataloader/pipeline.py +++ b/python/paddle/fluid/dataloader/pipeline.py @@ -48,6 +48,11 
@@ def __init__(self, queue_depth=2): self._init_programs() self.is_shutdown = False + + if paddle.distributed.ParallelEnv().nranks > 1: + paddle.set_device('gpu:%d' % + paddle.distributed.ParallelEnv().dev_id) + paddle.distributed.init_parallel_env() def _init_programs(self): self._main_program = fluid.Program() @@ -138,6 +143,8 @@ def __next__(self): except: raise StopIteration + if paddle.distributed.ParallelEnv().nranks > 1: + paddle.distributed.barrier() return {k: v for k, v in zip(self._out_names, self._output_vars)} # Python 2 compatable diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index 4b19f818f826ed..71e2f790ecc5cb 100644 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -1018,19 +1018,19 @@ def random_flip(x, prob=0.5, name=None): if prob < 0. or prob > 1.: raise ValueError("prob should in (0, 1) in random_flip") - rand_vec = layers.uniform_random_batch_size_like( - x, [1, 1], min=0., max=1.) - return rand_vec < prob - # helper = LayerHelper("random_flip", **locals()) - # out = helper.create_variable( - # name=unique_name.generate("random_flip"), - # type=core.VarDesc.VarType.LOD_TENSOR, - # dtype=core.VarDesc.VarType.BOOL) - # helper.append_op( - # type="random_flip", - # inputs={"X": x}, - # outputs={"Out": out}, - # attrs={"probability": prob}) + # rand_vec = layers.uniform_random_batch_size_like( + # x, [1, 1], min=0., max=1.) 
+ # return rand_vec < prob + helper = LayerHelper("random_flip", **locals()) + out = helper.create_variable( + name=unique_name.generate("random_flip"), + type=core.VarDesc.VarType.LOD_TENSOR, + dtype=core.VarDesc.VarType.BOOL) + helper.append_op( + type="random_flip", + inputs={"X": x}, + outputs={"Out": out}, + attrs={"probability": prob}) return out From b0d88eedbda67745be2655c47149ea9b912c8982 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Wed, 23 Mar 2022 14:25:35 +0000 Subject: [PATCH 69/95] fix training hang --- paddle/fluid/framework/operator.cc | 4 +- paddle/fluid/operators/data/CMakeLists.txt | 8 +- .../fluid/operators/data/batch_decode_op.cc | 8 + .../fluid/operators/data/batch_decode_op.cu | 24 ++- paddle/fluid/operators/data/batch_decode_op.h | 2 +- .../data/batch_decode_random_crop_op.cc | 8 + .../data/batch_decode_random_crop_op.cu | 20 ++- .../data/batch_decode_random_crop_op.h | 2 +- .../{nvjpeg_decoder.cc => image_decoder.cc} | 165 +++++++++--------- .../{nvjpeg_decoder.h => image_decoder.h} | 89 ++++++---- paddle/fluid/operators/data/map_op.cc | 4 - .../operators/data/mirror_normalize_op.cu | 2 - paddle/fluid/operators/data/random_flip_op.h | 2 - paddle/fluid/operators/data/shutdown.h | 16 +- paddle/fluid/platform/device_context.cc | 11 -- paddle/fluid/platform/dynload/nvjpeg.h | 2 + python/paddle/fluid/dataloader/ops.py | 4 +- python/paddle/vision/ops.py | 54 +++--- 18 files changed, 226 insertions(+), 199 deletions(-) rename paddle/fluid/operators/data/{nvjpeg_decoder.cc => image_decoder.cc} (65%) rename paddle/fluid/operators/data/{nvjpeg_decoder.h => image_decoder.h} (60%) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 75b1ba8fd87d53..2bd0589eac020c 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1110,9 +1110,9 @@ void OperatorWithKernel::RunImpl(const Scope& scope, const platform::Place& place, RuntimeContext* runtime_ctx) const { 
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto* dev_ctx = HasAttr("stream_id") ? + auto* dev_ctx = HasAttr("_stream_id") ? platform::AsyncDeviceContextPool::Instance().Get( - place, Attr("stream_id")) : nullptr; + place, Attr("_stream_id")) : nullptr; if (dev_ctx == nullptr) { dev_ctx = pool.Get(place); } diff --git a/paddle/fluid/operators/data/CMakeLists.txt b/paddle/fluid/operators/data/CMakeLists.txt index d9618b320a3376..386529476f947b 100644 --- a/paddle/fluid/operators/data/CMakeLists.txt +++ b/paddle/fluid/operators/data/CMakeLists.txt @@ -13,9 +13,9 @@ cc_library(map_runner SRCS map_runner.cc DEPS parallel_executor simple_threadpoo op_library(map_op SRCS map_op.cc map_op.cu.cc DEPS map_runner ${OP_HEADER_DEPS}) cc_library(random_roi_generator SRCS random_roi_generator.cc DEPS ${OP_HEADER_DEPS}) -cc_library(nvjpeg_decoder SRCS nvjpeg_decoder.cc DEPS random_roi_generator ${OP_HEADER_DEPS} ${OpenCV_LIBS}) -op_library(batch_decode_random_crop_op SRCS batch_decode_random_crop_op.cc batch_decode_random_crop_op.cu DEPS nvjpeg_decoder ${OP_HEADER_DEPS}) -op_library(batch_decode_op SRCS batch_decode_op.cc batch_decode_op.cu DEPS nvjpeg_decoder ${OP_HEADER_DEPS}) +cc_library(image_decoder SRCS image_decoder.cc DEPS random_roi_generator ${OP_HEADER_DEPS} ${OpenCV_LIBS}) +op_library(batch_decode_random_crop_op SRCS batch_decode_random_crop_op.cc batch_decode_random_crop_op.cu DEPS image_decoder ${OP_HEADER_DEPS}) +op_library(batch_decode_op SRCS batch_decode_op.cc batch_decode_op.cu DEPS image_decoder ${OP_HEADER_DEPS}) op_library(random_crop_and_resize_op SRCS random_crop_and_resize_op.cc random_crop_and_resize_op.cu DEPS ${OP_HEADER_DEPS}) op_library(batch_resize_op SRCS batch_resize_op.cc batch_resize_op.cu DEPS ${OP_HEADER_DEPS}) @@ -28,4 +28,4 @@ op_library(mirror_normalize_op SRCS mirror_normalize_op.cc mirror_normalize_op.c # register_operators() # TODO: add test here -# cc_test(xxx SRCS xxx DEPS xxx \ No newline at end of 
file +# cc_test(xxx SRCS xxx DEPS xxx diff --git a/paddle/fluid/operators/data/batch_decode_op.cc b/paddle/fluid/operators/data/batch_decode_op.cc index ebe7908ac6e0f1..99355eaebf556a 100644 --- a/paddle/fluid/operators/data/batch_decode_op.cc +++ b/paddle/fluid/operators/data/batch_decode_op.cc @@ -82,6 +82,14 @@ and 255. "(int64_t)" "The unique hash id used as cache key for " "decode thread pool"); + AddAttr( + "host_memory_padding", + "(int64, default 0), pinned memory allocation padding number for Nvjpeg decoding") + .SetDefault(0); + AddAttr( + "device_memory_padding", + "(int64, default 0), device memory allocation padding number for Nvjpeg decoding") + .SetDefault(0); } }; diff --git a/paddle/fluid/operators/data/batch_decode_op.cu b/paddle/fluid/operators/data/batch_decode_op.cu index c77b81f4ecd927..f9b2a65d397d1d 100644 --- a/paddle/fluid/operators/data/batch_decode_op.cu +++ b/paddle/fluid/operators/data/batch_decode_op.cu @@ -23,8 +23,6 @@ namespace data { using LoDTensorBlockingQueueHolder = operators::reader::LoDTensorBlockingQueueHolder; -// static NvjpegDecoderThreadPool* decode_pool = nullptr; - template class GPUBatchDecodeKernel : public framework::OpKernel { public: @@ -34,16 +32,16 @@ class GPUBatchDecodeKernel : public framework::OpKernel { auto mode = ctx.Attr("mode"); auto local_rank = ctx.Attr("local_rank"); auto program_id = ctx.Attr("program_id"); - - // // multi-phrase decode thread pool - // if (!decode_pool) { - // LOG(ERROR) << "GPUBatchDecodeJpegKernel decode_pool init"; - // decode_pool = new NvjpegDecoderThreadPool(num_threads, mode, local_rank); - // } - auto* decode_pool = - DecoderThreadPoolManager::Instance()->GetDecoderThreadPool( - program_id, num_threads, mode, local_rank); + auto host_memory_padding = ctx.Attr("host_memory_padding"); + auto device_memory_padding = ctx.Attr("device_memory_padding"); + // multi-phrase decode thread pool + auto* decode_pool = + ImageDecoderThreadPoolManager::Instance()->GetDecoderThreadPool( + 
program_id, num_threads, mode, local_rank, + static_cast(host_memory_padding), + static_cast(device_memory_padding)); + const framework::LoDTensorArray* inputs = ctx.Input("X"); @@ -56,14 +54,14 @@ class GPUBatchDecodeKernel : public framework::OpKernel { auto* x_data = x.data(); size_t x_numel = static_cast(x.numel()); - NvjpegDecodeTask task = { + ImageDecodeTask task = { .bit_stream = x_data, .bit_len = x_numel, .tensor = &out_array[i], .roi_generator = nullptr, .place = ctx.GetPlace() }; - decode_pool->AddTask(std::make_shared(task)); + decode_pool->AddTask(std::make_shared(task)); } decode_pool->RunAll(true); diff --git a/paddle/fluid/operators/data/batch_decode_op.h b/paddle/fluid/operators/data/batch_decode_op.h index a16385b594c293..5f98172f27990c 100644 --- a/paddle/fluid/operators/data/batch_decode_op.h +++ b/paddle/fluid/operators/data/batch_decode_op.h @@ -22,7 +22,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/operators/data/nvjpeg_decoder.h" +#include "paddle/fluid/operators/data/image_decoder.h" namespace paddle { diff --git a/paddle/fluid/operators/data/batch_decode_random_crop_op.cc b/paddle/fluid/operators/data/batch_decode_random_crop_op.cc index 7660f7f3ccb5a7..902110f10894fa 100644 --- a/paddle/fluid/operators/data/batch_decode_random_crop_op.cc +++ b/paddle/fluid/operators/data/batch_decode_random_crop_op.cc @@ -132,6 +132,14 @@ and 255. 
"for optionally converting the image, can be \"unchanged\" " ",\"gray\" , \"rgb\" .") .SetDefault("unchanged"); + AddAttr( + "host_memory_padding", + "(int64, default 0), pinned memory allocation padding number for Nvjpeg decoding") + .SetDefault(0); + AddAttr( + "device_memory_padding", + "(int64, default 0), device memory allocation padding number for Nvjpeg decoding") + .SetDefault(0); AddAttr( "data_layout", "(string, default NCHW) Only used in " diff --git a/paddle/fluid/operators/data/batch_decode_random_crop_op.cu b/paddle/fluid/operators/data/batch_decode_random_crop_op.cu index 0f6ddef5d19939..3d8df3e975635b 100644 --- a/paddle/fluid/operators/data/batch_decode_random_crop_op.cu +++ b/paddle/fluid/operators/data/batch_decode_random_crop_op.cu @@ -27,7 +27,7 @@ namespace data { using LoDTensorBlockingQueueHolder = operators::reader::LoDTensorBlockingQueueHolder; using DataLayout = framework::DataLayout; -NvjpegDecoderThreadPool* decode_pool = nullptr; +ImageDecoderThreadPool* decode_pool = nullptr; template class GPUBatchDecodeRandomCropKernel : public framework::OpKernel { @@ -37,11 +37,15 @@ class GPUBatchDecodeRandomCropKernel : public framework::OpKernel { auto mode = ctx.Attr("mode"); auto local_rank = ctx.Attr("local_rank"); auto program_id = ctx.Attr("program_id"); + auto host_memory_padding = ctx.Attr("host_memory_padding"); + auto device_memory_padding = ctx.Attr("device_memory_padding"); // multi-phrase decode thread pool auto* decode_pool = - DecoderThreadPoolManager::Instance()->GetDecoderThreadPool( - program_id, num_threads, mode, local_rank); + ImageDecoderThreadPoolManager::Instance()->GetDecoderThreadPool( + program_id, num_threads, mode, local_rank, + static_cast(host_memory_padding), + static_cast(device_memory_padding)); const framework::LoDTensorArray* inputs = ctx.Input("X"); @@ -81,24 +85,24 @@ class GPUBatchDecodeRandomCropKernel : public framework::OpKernel { size_t x_numel = static_cast(x.numel()); if (data_layout == 
DataLayout::kNCHW){ - NvjpegDecodeTask task = { + ImageDecodeTask task = { .bit_stream = x_data, .bit_len = x_numel, .tensor = &temp_array[i], .roi_generator = generators->at(i).get(), .place = dev }; - decode_pool->AddTask(std::make_shared(task)); + decode_pool->AddTask(std::make_shared(task)); } else{ - NvjpegDecodeTask task = { + ImageDecodeTask task = { .bit_stream = x_data, .bit_len = x_numel, .tensor = &out_array[i], .roi_generator = generators->at(i).get(), .place = dev }; - decode_pool->AddTask(std::make_shared(task)); + decode_pool->AddTask(std::make_shared(task)); } } @@ -109,11 +113,9 @@ class GPUBatchDecodeRandomCropKernel : public framework::OpKernel { const auto& dev_ctx = ctx.cuda_device_context(); paddle::operators::math::Transpose trans; std::vector axis = {2, 0, 1}; - // LOG(ERROR) << "start transpose 01!!!"; for (size_t i = 0; i < inputs->size(); i++) { // Do transpose const framework::DDim& in_sizes = temp_array[i].dims(); - // const int ndim = in_sizes.size(); framework::DDim transposed_input_shape = in_sizes.transpose(axis); std::vector transposed_input_shape_ = framework::vectorize(transposed_input_shape); diff --git a/paddle/fluid/operators/data/batch_decode_random_crop_op.h b/paddle/fluid/operators/data/batch_decode_random_crop_op.h index de96e38ca95ef9..6e36498e576fb7 100644 --- a/paddle/fluid/operators/data/batch_decode_random_crop_op.h +++ b/paddle/fluid/operators/data/batch_decode_random_crop_op.h @@ -22,7 +22,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/operators/data/nvjpeg_decoder.h" +#include "paddle/fluid/operators/data/image_decoder.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/data/nvjpeg_decoder.cc b/paddle/fluid/operators/data/image_decoder.cc similarity index 65% rename from paddle/fluid/operators/data/nvjpeg_decoder.cc rename to paddle/fluid/operators/data/image_decoder.cc 
index 188723ba306fea..44b7d43669e336 100644 --- a/paddle/fluid/operators/data/nvjpeg_decoder.cc +++ b/paddle/fluid/operators/data/image_decoder.cc @@ -12,29 +12,35 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/data/nvjpeg_decoder.h" +#include "paddle/fluid/operators/data/image_decoder.h" namespace paddle { namespace operators { namespace data { -NvjpegDecoder::NvjpegDecoder(std::string mode, int dev_id) +ImageDecoder::ImageDecoder(std::string mode, int dev_id, + size_t host_memory_padding, + size_t device_memory_padding) : nvjpeg_streams_(2), pinned_buffers_(2), page_id_(0), mode_(mode) { platform::SetDeviceId(dev_id); - // create cuda stream - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreateWithFlags(&cuda_stream_, cudaStreamNonBlocking)); // create nvjpeg handle and stream - // device_allocator_.dev_malloc = &cudaMalloc; - // device_allocator_.dev_free = &cudaFree; - // pinned_allocator_.pinned_malloc = &cudaMallocHost; - // pinned_allocator_.pinned_free = &cudaFreeHost; PADDLE_ENFORCE_NVJPEG_SUCCESS( platform::dynload::nvjpegCreateEx(NVJPEG_BACKEND_HYBRID, &device_allocator_, &pinned_allocator_, 0, &handle_)); + + // set pinned/device memory padding + if (host_memory_padding > 0) { + PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegSetPinnedMemoryPadding(host_memory_padding, handle_)); + } + if (device_memory_padding > 0) { + PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegSetDeviceMemoryPadding(device_memory_padding, handle_)); + } + + // create nvjpeg stream for (size_t i = 0; i < nvjpeg_streams_.size(); i++) { PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegJpegStreamCreate(handle_, &nvjpeg_streams_[i])); } @@ -51,9 +57,7 @@ NvjpegDecoder::NvjpegDecoder(std::string mode, int dev_id) } } -NvjpegDecoder::~NvjpegDecoder() { - 
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(cuda_stream_)); - +ImageDecoder::~ImageDecoder() { // destroy nvjpeg streams for (size_t i = 0; i < nvjpeg_streams_.size(); i++) { PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegJpegStreamDestroy(nvjpeg_streams_[i])); @@ -69,27 +73,23 @@ NvjpegDecoder::~NvjpegDecoder() { PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegBufferPinnedDestroy(pinned_buffers_[i])); } - // destroy nvjpeg handle and cuda stream at last + // destroy nvjpeg handle at last PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDestroy(handle_)); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(cuda_stream_)); } -// cv::Mat DecodeRandomCropResize(const unsigned char* data, size_t length, -// RandomROIGenerator* roi_generator, -// unsigned char* workspace, size_t workspace_size, -// unsigned char* dst, int target_width, -// int target_height) { +void ImageDecoder::CPUDecodeRandomCrop( + const uint8_t* data, size_t length, + RandomROIGenerator* roi_generator, + unsigned char* workspace, size_t workspace_size, + framework::LoDTensor* out, platform::Place place) { + VLOG(4) << "CPUDecodeRandomCropResize enter"; #ifdef PADDLE_WITH_OPENCV -void NvjpegDecoder::CPUDecodeRandomCropResize(const uint8_t* data, size_t length, - RandomROIGenerator* roi_generator, - unsigned char* workspace, size_t workspace_size, - framework::LoDTensor& temp, framework::LoDTensor* out, platform::Place place) { cv::Mat image = cv::imdecode(cv::Mat(1, length, CV_8UC1, const_cast(data)), cv::IMREAD_COLOR); - cv::Mat cropped; - int height; - int width; + auto* image_data = image.data; + int height = image.rows; + int width = image.cols; if (roi_generator) { ROI roi; roi_generator->GenerateRandomROI(image.cols, image.rows, &roi); @@ -101,26 +101,28 @@ void NvjpegDecoder::CPUDecodeRandomCropResize(const uint8_t* data, size_t length height = roi.h; width = roi.w; - std::vector out_shape = {height, width, 3}; - temp.Resize(framework::make_ddim(out_shape)); - 
platform::CPUPlace cpu; - // allocate memory and assign to out_image - auto* data = temp.mutable_data(cpu); // todo jianglielin: why not work? // cropped.data = data; + cv::Mat cropped; image(cv_roi).copyTo(cropped); + image_data = cropped.data; + } - std::memcpy(data, cropped.data, 3 * height * width); + framework::LoDTensor cpu_tensor; + std::vector out_shape = {height, width, 3}; + cpu_tensor.Resize(framework::make_ddim(out_shape)); + // allocate memory and assign to out_image + auto* cpu_data = cpu_tensor.mutable_data(platform::CPUPlace()); - TensorCopySync(temp, place, out); - } else { - LOG(ERROR) << "Not Use Opencv decode!!!"; - // throw error - } -} + std::memcpy(cpu_data, image_data, 3 * height * width); + TensorCopySync(cpu_tensor, place, out); +#else + PADDLE_THROW(platform::errors::Fatal( + "Nvjpeg decode failed and Paddle is not compiled with OpenCV")); #endif +} -int NvjpegDecoder::ParseDecodeParams( +nvjpegStatus_t ImageDecoder::ParseDecodeParams( const uint8_t* bit_stream, size_t bit_len, framework::LoDTensor* out, RandomROIGenerator* roi_generator, nvjpegImage_t* out_image, platform::Place place) { @@ -132,20 +134,8 @@ int NvjpegDecoder::ParseDecodeParams( nvjpegStatus_t status = platform::dynload::nvjpegGetImageInfo(handle_, bit_stream, bit_len, &components, &subsampling, widths, heights); - // PADDLE_ENFORCE_NVJPEG_SUCCESS( - // platform::dynload::nvjpegGetImageInfo(handle_, bit_stream, bit_len, - // &components, &subsampling, widths, heights)); - if (status != NVJPEG_STATUS_SUCCESS || (components != 3 && components != 1)) { -#ifdef PADDLE_WITH_OPENCV - framework::LoDTensor temp; - CPUDecodeRandomCropResize(bit_stream, bit_len, roi_generator, nullptr, 0, temp, out, place); - return 1; -#endif - } - else{ - // LOG(ERROR) << "Use nvjpeg decode!!!"; - } + if (status != NVJPEG_STATUS_SUCCESS) return status; int64_t width = static_cast(widths[0]); int64_t height = static_cast(heights[0]); @@ -162,7 +152,8 @@ int NvjpegDecoder::ParseDecodeParams( 
output_components = 3; } else { PADDLE_THROW(platform::errors::Fatal( - "The provided mode is not supported for JPEG files on GPU: %s!", mode_)); + "The provided mode(%s) does not support components(%d)", + mode_, components)); } } else if (mode_ == "gray") { output_format = NVJPEG_OUTPUT_Y; @@ -171,7 +162,6 @@ int NvjpegDecoder::ParseDecodeParams( output_format = NVJPEG_OUTPUT_RGBI; output_components = 3; } else { - // std::cout << mode_ << std::endl; PADDLE_THROW(platform::errors::Fatal( "The provided mode is not supported for JPEG files on GPU: %s!", mode_)); } @@ -194,42 +184,46 @@ int NvjpegDecoder::ParseDecodeParams( auto* data = out->mutable_data(place); out_image->channel[0] = data; out_image->pitch[0] = output_components * width; - return 0; + + return NVJPEG_STATUS_SUCCESS; } -void NvjpegDecoder::Decode(const uint8_t* bit_stream, size_t bit_len, nvjpegImage_t* out_image) { +nvjpegStatus_t ImageDecoder::GPUDecodeRandomCrop(const uint8_t* bit_stream, size_t bit_len, nvjpegImage_t* out_image) { auto buffer = pinned_buffers_[page_id_]; auto stream = nvjpeg_streams_[page_id_]; page_id_ ^= 1; // decode jpeg in host to pinned buffer - PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegStateAttachPinnedBuffer(state_, buffer)); PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegJpegStreamParse(handle_, bit_stream, bit_len, false, false, stream)); - - (platform::dynload::nvjpegDecodeJpegHost(handle_, decoder_, state_, decode_params_, stream)); + PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegStateAttachPinnedBuffer(state_, buffer)); + nvjpegStatus_t status = platform::dynload::nvjpegDecodeJpegHost(handle_, decoder_, state_, decode_params_, stream); + if (status != NVJPEG_STATUS_SUCCESS) return status; // transfer and decode to device buffer PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegStateAttachDeviceBuffer(state_, device_buffer_)); PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecodeJpegTransferToDevice(handle_, decoder_, 
state_, stream, cuda_stream_)); - PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecodeJpegDevice(handle_, decoder_, state_, out_image, cuda_stream_)); - - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(cuda_stream_)); + status = platform::dynload::nvjpegDecodeJpegDevice(handle_, decoder_, state_, out_image, nullptr); + return status; } -void NvjpegDecoder::Run( +void ImageDecoder::Run( const uint8_t* bit_stream, size_t bit_len, framework::LoDTensor* out, RandomROIGenerator* roi_generator, platform::Place& place) { nvjpegImage_t image; - int res = ParseDecodeParams(bit_stream, bit_len, out, roi_generator, &image, place); - if (res) { + nvjpegStatus_t status = ParseDecodeParams(bit_stream, bit_len, out, roi_generator, &image, place); + if (status != NVJPEG_STATUS_SUCCESS) { + CPUDecodeRandomCrop(bit_stream, bit_len, roi_generator, nullptr, 0, out, place); return; } - // LOG(ERROR) << "ParseDecodeParams finish !!!"; - Decode(bit_stream, bit_len, &image); - // LOG(ERROR) << "Decode finish !!!"; + status = GPUDecodeRandomCrop(bit_stream, bit_len, &image); + if (status != NVJPEG_STATUS_SUCCESS) { + CPUDecodeRandomCrop(bit_stream, bit_len, roi_generator, nullptr, 0, out, place); + } } -NvjpegDecoderThreadPool::NvjpegDecoderThreadPool(const int num_threads, const std::string mode, const int dev_id) +ImageDecoderThreadPool::ImageDecoderThreadPool( + const int num_threads, const std::string mode, const int dev_id, + const size_t host_memory_padding, const size_t device_memory_padding) : threads_(num_threads), mode_(mode), dev_id_(dev_id), @@ -242,17 +236,18 @@ NvjpegDecoderThreadPool::NvjpegDecoderThreadPool(const int num_threads, const st "but got %d", num_threads)); for (int i = 0; i < num_threads; i++) { threads_.emplace_back( - std::thread(std::bind(&NvjpegDecoderThreadPool::ThreadLoop, this, i))); + std::thread(std::bind(&ImageDecoderThreadPool::ThreadLoop, + this, i, host_memory_padding, device_memory_padding))); } } 
-NvjpegDecoderThreadPool::~NvjpegDecoderThreadPool() { ShutDown(); } +ImageDecoderThreadPool::~ImageDecoderThreadPool() { ShutDown(); } -void NvjpegDecoderThreadPool::AddTask(std::shared_ptr task) { +void ImageDecoderThreadPool::AddTask(std::shared_ptr task) { task_queue_.push_back(task); } -void NvjpegDecoderThreadPool::RunAll(const bool wait, const bool sort) { +void ImageDecoderThreadPool::RunAll(const bool wait, const bool sort) { // Sort images in length descending order if (sort) SortTaskByLengthDescend(); @@ -266,13 +261,13 @@ void NvjpegDecoderThreadPool::RunAll(const bool wait, const bool sort) { if (wait) WaitTillTasksCompleted(); } -void NvjpegDecoderThreadPool::WaitTillTasksCompleted() { +void ImageDecoderThreadPool::WaitTillTasksCompleted() { std::unique_lock lock(mutex_); completed_cond_.wait(lock, [this] { return this->completed_; }); running_ = false; } -void NvjpegDecoderThreadPool::ShutDown() { +void ImageDecoderThreadPool::ShutDown() { std::unique_lock lock(mutex_); running_ = false; shutdown_ = true; @@ -286,23 +281,25 @@ } -void NvjpegDecoderThreadPool::SortTaskByLengthDescend() { +void ImageDecoderThreadPool::SortTaskByLengthDescend() { std::lock_guard lock(mutex_); std::sort(task_queue_.begin(), task_queue_.end(), - [](const std::shared_ptr a, - const std::shared_ptr b) { + [](const std::shared_ptr a, + const std::shared_ptr b) { return b->bit_len < a->bit_len; }); } -void NvjpegDecoderThreadPool::ThreadLoop(const int thread_idx) { - NvjpegDecoder* decoder = new NvjpegDecoder(mode_, dev_id_); +void ImageDecoderThreadPool::ThreadLoop( + const int thread_idx, const size_t host_memory_padding, + const size_t device_memory_padding) { + ImageDecoder* decoder = new ImageDecoder(mode_, dev_id_, + host_memory_padding, + device_memory_padding); while (!shutdown_) { std::unique_lock lock(mutex_); - // LOG(ERROR) << "ThreadLoop wait running_cond_"; running_cond_.wait(lock, [this] { return (running_ &&
!task_queue_.empty()) || shutdown_; }); - // LOG(ERROR) << "ThreadLoop shutdown_ " << shutdown_; if (shutdown_) break; auto task = task_queue_.front(); @@ -323,9 +320,9 @@ void NvjpegDecoderThreadPool::ThreadLoop(const int thread_idx) { } } -// initialization static variables out of MapRunnerManager -DecoderThreadPoolManager* DecoderThreadPoolManager::pm_instance_ptr_ = nullptr; -std::mutex DecoderThreadPoolManager::m_; +// initialization static variables out of ImageDecoderThreadPoolManager +ImageDecoderThreadPoolManager* ImageDecoderThreadPoolManager::pm_instance_ptr_ = nullptr; +std::mutex ImageDecoderThreadPoolManager::m_; } // namespace data } // namespace operators diff --git a/paddle/fluid/operators/data/nvjpeg_decoder.h b/paddle/fluid/operators/data/image_decoder.h similarity index 60% rename from paddle/fluid/operators/data/nvjpeg_decoder.h rename to paddle/fluid/operators/data/image_decoder.h index dca7b45c0b88d3..3a42db61103259 100644 --- a/paddle/fluid/operators/data/nvjpeg_decoder.h +++ b/paddle/fluid/operators/data/image_decoder.h @@ -15,6 +15,7 @@ limitations under the License. 
*/ #pragma once #include +#include #ifdef PADDLE_WITH_OPENCV #include @@ -34,12 +35,18 @@ namespace operators { namespace data { static int dev_malloc(void **p, size_t s) { return (int)cudaMalloc(p, s); } -static int dev_free(void *p) { return (int)cudaFree(p); } -static int host_malloc(void** p, size_t s, unsigned int f) { return (int)cudaHostAlloc(p, s, f); } -static int host_free(void* p) { return (int)cudaFreeHost(p); } +static int dev_free(void *p) { return (int)cudaFree(p); } -struct NvjpegDecodeTask { +static int host_malloc(void** p, size_t s, unsigned int f) { + return (int)cudaHostAlloc(p, s, f); +} + +static int host_free(void* p) { + return (int)cudaFreeHost(p); +} + +struct ImageDecodeTask { const uint8_t* bit_stream; size_t bit_len; framework::LoDTensor* tensor; @@ -47,29 +54,31 @@ struct NvjpegDecodeTask { platform::Place place; }; -class NvjpegDecoder { +class ImageDecoder { public: - NvjpegDecoder(const std::string mode, int dev_id); + ImageDecoder(const std::string mode, int dev_id, + size_t host_memory_padding=0, + size_t device_memory_padding=0); - ~NvjpegDecoder(); + ~ImageDecoder(); void Run(const uint8_t* bit_stream, size_t bit_len, framework::LoDTensor* out, RandomROIGenerator* roi_generator, platform::Place& place); private: - DISABLE_COPY_AND_ASSIGN(NvjpegDecoder); -#ifdef PADDLE_WITH_OPENCV - void CPUDecodeRandomCropResize(const uint8_t* data, size_t length, - RandomROIGenerator* roi_generator, - unsigned char* workspace, size_t workspace_size, - framework::LoDTensor& temp, framework::LoDTensor* out, platform::Place place); -#endif - int ParseDecodeParams( + DISABLE_COPY_AND_ASSIGN(ImageDecoder); + + void CPUDecodeRandomCrop(const uint8_t* data, size_t length, + RandomROIGenerator* roi_generator, + unsigned char* workspace, size_t workspace_size, + framework::LoDTensor* out, platform::Place place); + + nvjpegStatus_t ParseDecodeParams( const uint8_t* bit_stream, size_t bit_len, framework::LoDTensor* out, RandomROIGenerator* roi_generator, 
nvjpegImage_t* out_image, platform::Place place); - void Decode(const uint8_t* bit_stream, size_t bit_len, nvjpegImage_t* out_image); + nvjpegStatus_t GPUDecodeRandomCrop(const uint8_t* bit_stream, size_t bit_len, nvjpegImage_t* out_image); cudaStream_t cuda_stream_ = nullptr; @@ -90,13 +99,15 @@ class NvjpegDecoder { const std::string mode_; }; -class NvjpegDecoderThreadPool { +class ImageDecoderThreadPool { public: - NvjpegDecoderThreadPool(const int num_threads, const std::string mode, const int dev_id); + ImageDecoderThreadPool(const int num_threads, const std::string mode, + const int dev_id, size_t host_memory_padding, + size_t device_memory_padding); - ~NvjpegDecoderThreadPool(); + ~ImageDecoderThreadPool(); - void AddTask(std::shared_ptr task); + void AddTask(std::shared_ptr task); void RunAll(const bool wait, const bool sort = true); @@ -105,17 +116,18 @@ class NvjpegDecoderThreadPool { void ShutDown(); private: - DISABLE_COPY_AND_ASSIGN(NvjpegDecoderThreadPool); + DISABLE_COPY_AND_ASSIGN(ImageDecoderThreadPool); void SortTaskByLengthDescend(); - void ThreadLoop(const int thread_idx); + void ThreadLoop(const int thread_idx, const size_t host_memory_padding, + const size_t device_memory_padding); std::vector threads_; std::string mode_; int dev_id_; - std::deque> task_queue_; + std::deque> task_queue_; std::mutex mutex_; bool shutdown_; @@ -127,34 +139,39 @@ class NvjpegDecoderThreadPool { int outstand_tasks_; }; -class DecoderThreadPoolManager { +class ImageDecoderThreadPoolManager { private: - DISABLE_COPY_AND_ASSIGN(DecoderThreadPoolManager); + DISABLE_COPY_AND_ASSIGN(ImageDecoderThreadPoolManager); - static DecoderThreadPoolManager *pm_instance_ptr_; + static ImageDecoderThreadPoolManager *pm_instance_ptr_; static std::mutex m_; - std::map> prog_id_to_pool_; + std::map> prog_id_to_pool_; public: - static DecoderThreadPoolManager* Instance() { + static ImageDecoderThreadPoolManager* Instance() { if (pm_instance_ptr_ == nullptr) { std::lock_guard lk(m_); 
if (pm_instance_ptr_ == nullptr) { - pm_instance_ptr_ = new DecoderThreadPoolManager; + pm_instance_ptr_ = new ImageDecoderThreadPoolManager; } } return pm_instance_ptr_; } - NvjpegDecoderThreadPool* GetDecoderThreadPool( + ImageDecoderThreadPool* GetDecoderThreadPool( const int64_t program_id, const int num_threads, - const std::string mode, const int dev_id) { + const std::string mode, const int dev_id, + const size_t host_memory_padding, + const size_t device_memory_padding) { auto iter = prog_id_to_pool_.find(program_id); if (iter == prog_id_to_pool_.end()) { + LOG(ERROR) << "GetDecoderThreadPool new"; prog_id_to_pool_[program_id] = - std::unique_ptr( - new NvjpegDecoderThreadPool(num_threads, mode, dev_id)); + std::unique_ptr( + new ImageDecoderThreadPool(num_threads, mode, dev_id, + host_memory_padding, + device_memory_padding)); } return prog_id_to_pool_[program_id].get(); } @@ -177,9 +194,11 @@ class DecoderThreadPoolManager { } } - DecoderThreadPoolManager() { VLOG(1) << "DecoderThreadPoolManager init"; } + ImageDecoderThreadPoolManager() { + VLOG(1) << "ImageDecoderThreadPoolManager init"; + } - ~DecoderThreadPoolManager() { + ~ImageDecoderThreadPoolManager() { VLOG(1) << "~DecoderThreadPoolManager"; ShutDown(); } diff --git a/paddle/fluid/operators/data/map_op.cc b/paddle/fluid/operators/data/map_op.cc index 70f26457e963ee..56097316b22a16 100644 --- a/paddle/fluid/operators/data/map_op.cc +++ b/paddle/fluid/operators/data/map_op.cc @@ -28,7 +28,6 @@ class MapOp : public framework::OperatorBase { : OperatorBase(type, inputs, outputs, attrs) {} void InferShape(framework::InferShapeContext* ctx) const { - OP_INOUT_CHECK(ctx->HasInputs("In"), "Input", "In", "MapOp"); OP_INOUT_CHECK(ctx->HasOutputs("Out"), "Output", "Out", "MapOp"); } @@ -42,7 +41,6 @@ class MapOp : public framework::OperatorBase { private: void RunImpl(const framework::Scope& scope, const platform::Place& dev_place) const override { - // LOG(ERROR) << "MapOpKernel RunImpl enter"; // Step1: 
get output vars and attrs auto inputs = Inputs("In"); std::vector input_vars; @@ -72,14 +70,12 @@ class MapOp : public framework::OperatorBase { map_block, program_id, &scope, dev_place, input_var_names, output_var_names, input_queues, output_queues); - // LOG(ERROR) << "MapOpKernel RunImpl finish"; } }; class MapInferShape : public framework::InferShapeBase { public: void operator()(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInputs("In"), "Input", "In", "MapOp"); OP_INOUT_CHECK(ctx->HasOutputs("Out"), "Output", "Out", "MapOp"); } }; diff --git a/paddle/fluid/operators/data/mirror_normalize_op.cu b/paddle/fluid/operators/data/mirror_normalize_op.cu index 9c62e41b5af462..40ece36dbafa5b 100644 --- a/paddle/fluid/operators/data/mirror_normalize_op.cu +++ b/paddle/fluid/operators/data/mirror_normalize_op.cu @@ -42,7 +42,6 @@ template class MirrorNormalizeCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - LOG(ERROR) << "MirrorNormalizeCUDAKernel Compute start"; auto* x = ctx.Input("X"); auto* mirror = ctx.Input("Mirror"); auto* out = ctx.Output("Out"); @@ -81,7 +80,6 @@ class MirrorNormalizeCUDAKernel : public framework::OpKernel { KeMirrorNormalize<<>>( numel, x_data, mirror_data, out_data, mean_data, std_data, chw, hw, w); - LOG(ERROR) << "MirrorNormalizeCUDAKernel Compute finish"; } }; diff --git a/paddle/fluid/operators/data/random_flip_op.h b/paddle/fluid/operators/data/random_flip_op.h index f1564fc2ed4521..bbf38806de1e52 100644 --- a/paddle/fluid/operators/data/random_flip_op.h +++ b/paddle/fluid/operators/data/random_flip_op.h @@ -65,7 +65,6 @@ template class RandomFlipCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - LOG(ERROR) << "RandomFlipCPUKernel enter"; const Tensor* x = ctx.Input("X"); Tensor* out = ctx.Output("Out"); @@ -77,7 +76,6 @@ class RandomFlipCPUKernel : public 
framework::OpKernel { for (int64_t i = 0; i < x->dims()[0]; i++) { data[i] = generator->Generate(); } - LOG(ERROR) << "RandomFlipCPUKernel finish"; } }; diff --git a/paddle/fluid/operators/data/shutdown.h b/paddle/fluid/operators/data/shutdown.h index 15ac57d3bdd5f8..d3eaa94d3feeee 100644 --- a/paddle/fluid/operators/data/shutdown.h +++ b/paddle/fluid/operators/data/shutdown.h @@ -14,7 +14,7 @@ #pragma once #include "paddle/fluid/operators/data/data_reader_op.h" -#include "paddle/fluid/operators/data/nvjpeg_decoder.h" +#include "paddle/fluid/operators/data/image_decoder.h" #include "paddle/fluid/operators/data/map_runner.h" #include "paddle/fluid/operators/data/pipeline.h" @@ -23,25 +23,25 @@ namespace paddle { namespace operators { namespace data { -extern NvjpegDecoderThreadPool* decode_pool; +extern ImageDecoderThreadPool* decode_pool; void ShutDownAllDataLoaders() { - LOG(ERROR) << "ShutDownAllDataLoaders enter"; + VLOG(4) << "ShutDownAllDataLoaders enter"; // step 1: shutdown reader ReaderManager::Instance()->ShutDown(); - LOG(ERROR) << "ShutDownAllDataLoaders reader_wrapper shutdown finish"; + // LOG(ERROR) << "ShutDownAllDataLoaders reader_wrapper shutdown finish"; // step 2: shutdown decoder if (decode_pool) decode_pool->ShutDown(); - LOG(ERROR) << "ShutDownAllDataLoaders decode_pool shutdown finish"; + // LOG(ERROR) << "ShutDownAllDataLoaders decode_pool shutdown finish"; // step 3: shutdown MapRunner MapRunnerManager::Instance()->ShutDown(); - LOG(ERROR) << "ShutDownAllDataLoaders MapRunner shutdown finish"; + // LOG(ERROR) << "ShutDownAllDataLoaders MapRunner shutdown finish"; // step 4: shutdown Pipeline PipelineManager::Instance()->ShutDown(); - LOG(ERROR) << "ShutDownAllDataLoaders Pipeline shutdown finish"; + VLOG(4) << "ShutDownAllDataLoaders Pipeline shutdown finish"; } void ShutDownReadersAndDecoders(const int64_t program_id) { @@ -50,7 +50,7 @@
ReaderManager::Instance()->ShutDownReader(program_id); // step 2: shutdown decoder - DecoderThreadPoolManager::Instance()->ShutDownDecoder(program_id); + ImageDecoderThreadPoolManager::Instance()->ShutDownDecoder(program_id); LOG(ERROR) << "ShutDownReadersAndDecoders finish"; } diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 30ebe88acfe9d6..e6ee1818a49c5f 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -200,20 +200,9 @@ platform::DeviceContext* AsyncDeviceContextPool::Get(const platform::Place& plac return device_contexts_[place][stream_id].get(); } else { auto* dev_ctx = new CUDADeviceContext(BOOST_GET_CONST(CUDAPlace, place)); - // LOG(ERROR) << "craete dev_ctx " << dev_ctx << " with stream " << dev_ctx->stream(); device_contexts_[place].emplace(stream_id, std::unique_ptr(dev_ctx)); return dev_ctx; } - // auto stream_map = place_it->second; - // auto stream_it = stream_map.find(stream_id); - // if (stream_it == stream_map.end()) { - // // auto dev_ctx = std::unique_ptr(new CUDADeviceContext(BOOST_GET_CONST(CUDAPlace, place))); - // // stream_map.emplace(stream_id, dev_ctx); - // // return dev_ctx.get(); - // } else { - // // return stream_it->second.get(); - // } - // return nullptr; } AsyncDeviceContextPool::AsyncDeviceContextPool( diff --git a/paddle/fluid/platform/dynload/nvjpeg.h b/paddle/fluid/platform/dynload/nvjpeg.h index f9e42c83586c54..b06d291e9c14af 100644 --- a/paddle/fluid/platform/dynload/nvjpeg.h +++ b/paddle/fluid/platform/dynload/nvjpeg.h @@ -40,6 +40,8 @@ extern void *nvjpeg_dso_handle; #define NVJPEG_RAND_ROUTINE_EACH(__macro) \ __macro(nvjpegCreateSimple); \ __macro(nvjpegCreateEx); \ + __macro(nvjpegSetDeviceMemoryPadding); \ + __macro(nvjpegSetPinnedMemoryPadding); \ __macro(nvjpegJpegStateCreate); \ __macro(nvjpegJpegStreamCreate); \ __macro(nvjpegDecodeParamsCreate); \ diff --git a/python/paddle/fluid/dataloader/ops.py 
b/python/paddle/fluid/dataloader/ops.py index 0916d4e00d0294..456747b5a71121 100755 --- a/python/paddle/fluid/dataloader/ops.py +++ b/python/paddle/fluid/dataloader/ops.py @@ -60,7 +60,7 @@ def _generate_stream_id(): return _stream_id_generator.get_stream_id() -def map(map_func, inputs): +def map(map_func, inputs=[]): inputs = _to_list(inputs) if in_dygraph_mode(): return map_func(*inputs) @@ -100,7 +100,7 @@ def map(map_func, inputs): stream_id = _generate_stream_id() for idx in range(map_block.desc.op_size()): - map_block.desc.op(idx)._set_attr('stream_id', stream_id) + map_block.desc.op(idx)._set_attr('_stream_id', stream_id) helper.append_op( type="map", diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index 71e2f790ecc5cb..f992d0d1e038ff 100644 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -867,7 +867,9 @@ def read_file(filename, name=None): return out -def image_decode(x, mode='unchanged', num_threads=2, name=None): +def image_decode(x, mode='unchanged', num_threads=2, + host_memory_padding=0, device_memory_padding=0, + name=None): """ Decodes a JPEG image into a 3 dimensional RGB Tensor or 1 dimensional Gray Tensor. Optionally converts the image to the desired format. 
@@ -910,13 +912,17 @@ def image_decode(x, mode='unchanged', num_threads=2, name=None): program_id = utils._hash_with_id(mode, num_threads, name, local_rank) return _C_ops.batch_decode( x, out, "mode", mode, "num_threads", num_threads, - "local_rank", local_rank, "program_id", program_id) + "local_rank", local_rank, "program_id", program_id, + "host_memory_padding", host_memory_padding, + "device_memory_padding", device_memory_padding) inputs = {'X': x} attrs = {"mode": mode, "num_threads": num_threads, "local_rank": local_rank, - "program_id": utils._hash_with_id(default_main_program())} + "program_id": utils._hash_with_id(default_main_program()), + "host_memory_padding": host_memory_padding, + "device_memory_padding": device_memory_padding} helper = LayerHelper("batch_decode", **locals()) out = helper.create_variable( @@ -932,6 +938,8 @@ def image_decode(x, mode='unchanged', num_threads=2, name=None): def image_decode_random_crop(x, mode='unchanged', num_threads=2, + host_memory_padding=0, + device_memory_padding=0, data_layout='NCHW', aspect_ratio_min=3./4., aspect_ratio_max=4./3., @@ -983,16 +991,20 @@ def image_decode_random_crop(x, core.VarDesc.VarType.LOD_TENSOR_ARRAY, False) program_id = utils._hash_with_id(mode, num_threads, name, local_rank) return _C_ops.batch_decode_random_crop( - x, out, "mode", mode, "num_threads", num_threads, "data_layout", data_layout, - "aspect_ratio_min", aspect_ratio_min, - "aspect_ratio_max", aspect_ratio_max, + x, out, "mode", mode, "num_threads", num_threads, + "data_layout", data_layout, "aspect_ratio_min", + aspect_ratio_min, "aspect_ratio_max", aspect_ratio_max, "area_min", area_min, "area_max", area_max, "num_attempts", num_attempts, "local_rank", local_rank, - "program_id", program_id) + "program_id", program_id, + "host_memory_padding", host_memory_padding, + "device_memory_padding", device_memory_padding) inputs = {'X': x} attrs = {"mode": mode, "num_threads": num_threads, + "host_memory_padding": host_memory_padding, + 
"device_memory_padding": device_memory_padding, "data_layout": data_layout, "aspect_ratio_min": aspect_ratio_min, "aspect_ratio_max": aspect_ratio_max, @@ -1018,20 +1030,20 @@ def random_flip(x, prob=0.5, name=None): if prob < 0. or prob > 1.: raise ValueError("prob should in (0, 1) in random_flip") - # rand_vec = layers.uniform_random_batch_size_like( - # x, [1, 1], min=0., max=1.) - # return rand_vec < prob - helper = LayerHelper("random_flip", **locals()) - out = helper.create_variable( - name=unique_name.generate("random_flip"), - type=core.VarDesc.VarType.LOD_TENSOR, - dtype=core.VarDesc.VarType.BOOL) - helper.append_op( - type="random_flip", - inputs={"X": x}, - outputs={"Out": out}, - attrs={"probability": prob}) - return out + rand_vec = layers.uniform_random_batch_size_like( + x, [1, 1], min=0., max=1.) + return rand_vec < prob + # helper = LayerHelper("random_flip", **locals()) + # out = helper.create_variable( + # name=unique_name.generate("random_flip"), + # type=core.VarDesc.VarType.LOD_TENSOR, + # dtype=core.VarDesc.VarType.BOOL) + # helper.append_op( + # type="random_flip", + # inputs={"X": x}, + # outputs={"Out": out}, + # attrs={"probability": prob}) + # return out def mirror_normalize(x, mirror, From 018451afe8fcb3e209b95090eb02342ebbb3db89 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Wed, 23 Mar 2022 14:54:58 +0000 Subject: [PATCH 70/95] change label data type to int64 --- paddle/fluid/operators/data/file_label_loader_op.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/data/file_label_loader_op.h b/paddle/fluid/operators/data/file_label_loader_op.h index eef07790372a24..d3b7fc611fa397 100644 --- a/paddle/fluid/operators/data/file_label_loader_op.h +++ b/paddle/fluid/operators/data/file_label_loader_op.h @@ -369,7 +369,7 @@ class FileLabelLoaderCPUKernel: public framework::OpKernel { image_arr->reserve(batch_size); label_tensor->Resize( framework::make_ddim({static_cast(batch_size)})); - auto* 
label_data = label_tensor->mutable_data(platform::CPUPlace()); + auto* label_data = label_tensor->mutable_data(platform::CPUPlace()); for (int64_t i = 0; i < batch_size; i++) { int64_t index = static_cast(indices_data[i]); auto file = samples->at(index).first; @@ -389,7 +389,7 @@ class FileLabelLoaderCPUKernel: public framework::OpKernel { input.read(reinterpret_cast(data), file_size); image_arr->emplace_back(image); - label_data[i] = label; + label_data[i] = static_cast(label); } LOG(ERROR) << "FileLabelLoaderOp RunImpl finish"; From e3964706e73a921ea9b0a544cd471aff473f5ca3 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Fri, 25 Mar 2022 09:39:07 +0000 Subject: [PATCH 71/95] add opencv cvtColor --- paddle/fluid/operators/data/image_decoder.cc | 15 ++++++++------- paddle/fluid/operators/data/image_decoder.h | 1 - 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/operators/data/image_decoder.cc b/paddle/fluid/operators/data/image_decoder.cc index 44b7d43669e336..168f501b717839 100644 --- a/paddle/fluid/operators/data/image_decoder.cc +++ b/paddle/fluid/operators/data/image_decoder.cc @@ -87,7 +87,7 @@ void ImageDecoder::CPUDecodeRandomCrop( cv::Mat image = cv::imdecode(cv::Mat(1, length, CV_8UC1, const_cast(data)), cv::IMREAD_COLOR); - auto* image_data = image.data; + cv::Mat cropped; int height = image.rows; int width = image.cols; if (roi_generator) { @@ -101,20 +101,21 @@ void ImageDecoder::CPUDecodeRandomCrop( height = roi.h; width = roi.w; - // todo jianglielin: why not work? 
- // cropped.data = data; - cv::Mat cropped; image(cv_roi).copyTo(cropped); - image_data = cropped.data; + } else { + cropped = image; } + // allocate cpu tensor and memory framework::LoDTensor cpu_tensor; std::vector out_shape = {height, width, 3}; cpu_tensor.Resize(framework::make_ddim(out_shape)); - // allocate memory and assign to out_image auto* cpu_data = cpu_tensor.mutable_data(platform::CPUPlace()); - std::memcpy(cpu_data, image_data, 3 * height * width); + cv::Mat cpu_mat(height, width, CV_8UC3, const_cast(cpu_data), cv::Mat::AUTO_STEP); + cv::cvtColor(cropped, cpu_mat, cv::COLOR_BGR2RGB); + + // copy cpu tensor to output gpu tensor TensorCopySync(cpu_tensor, place, out); #else PADDLE_THROW(platform::errors::Fatal( diff --git a/paddle/fluid/operators/data/image_decoder.h b/paddle/fluid/operators/data/image_decoder.h index 3a42db61103259..065f8b7f5cef20 100644 --- a/paddle/fluid/operators/data/image_decoder.h +++ b/paddle/fluid/operators/data/image_decoder.h @@ -166,7 +166,6 @@ class ImageDecoderThreadPoolManager { const size_t device_memory_padding) { auto iter = prog_id_to_pool_.find(program_id); if (iter == prog_id_to_pool_.end()) { - LOG(ERROR) << "GetDecoderThreadPool new"; prog_id_to_pool_[program_id] = std::unique_ptr( new ImageDecoderThreadPool(num_threads, mode, dev_id, From e5bed90f70b7c14c51a8d75d387ef2004cb407f7 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Sun, 27 Mar 2022 14:53:07 +0000 Subject: [PATCH 72/95] add Pipeline.reset() --- paddle/fluid/operators/data/data_reader_op.h | 54 ++++++++++++++++--- paddle/fluid/operators/data/dataloader_op.h | 4 +- paddle/fluid/operators/data/map_runner.cc | 24 +++++++-- paddle/fluid/operators/data/map_runner.h | 19 +++++-- paddle/fluid/operators/data/pipeline.h | 9 ++++ .../operators/data/{shutdown.h => utils.h} | 15 ++++++ paddle/fluid/pybind/pybind.cc | 9 +++- python/paddle/fluid/core.py | 2 + python/paddle/fluid/dataloader/pipeline.py | 16 ++++-- 9 files changed, 133 insertions(+), 19 deletions(-) 
rename paddle/fluid/operators/data/{shutdown.h => utils.h} (85%) diff --git a/paddle/fluid/operators/data/data_reader_op.h b/paddle/fluid/operators/data/data_reader_op.h index b6e166bc3e2ae1..2f79c22728ff21 100644 --- a/paddle/fluid/operators/data/data_reader_op.h +++ b/paddle/fluid/operators/data/data_reader_op.h @@ -45,7 +45,7 @@ class Sampler { const int rank, const int world_size) : current_iter_(0), batch_size_(batch_size), - // num_samples_(num_samples), + shuffle_(shuffle), drop_last_(drop_last), rank_(rank), world_size_(world_size) { @@ -96,9 +96,18 @@ class Sampler { } } + void Reset() { + if (shuffle_) { + std::shuffle(sample_ids_.begin(), sample_ids_.end(), rnd_); + } + + current_iter_ = 0; + } + private: int64_t current_iter_; const int64_t batch_size_; + const bool shuffle_; int64_t num_samples_; const bool drop_last_; const int rank_; @@ -123,6 +132,7 @@ class DataReader { const int rank, const int world_size) : running_(true), + shutdown_(false), reader_block_(reader_block), place_(place), indices_var_name_(indices_var_name), @@ -142,7 +152,12 @@ class DataReader { reader_thread_ = std::thread([this, scope] { auto& scope_ = scope->NewScope(); framework::Executor executor(place_); - while (running_.load()) { + while (!shutdown_) { + // check running or shutdown + std::unique_lock lock(mutex_); + running_cond_.wait(lock, [this] { return running_ || shutdown_; }); + if (shutdown_) break; + std::vector indices; sampler_.GetNextIndices(&indices); // shutdown reader if indices drained @@ -153,8 +168,8 @@ class DataReader { queue->Close(); } - running_.store(false); - return; + running_ = false; + continue; } ShareIndicesIntoScope(&scope_, indices); @@ -198,10 +213,25 @@ class DataReader { if (queue && !queue->IsClosed()) queue->Close(); } - running_.store(false); + shutdown_ = true; + running_ = false; + running_cond_.notify_all(); + if (reader_thread_.joinable()) reader_thread_.join(); } + void Reset() { + // reopen all output queues + for (auto& queue: 
output_queues_) queue->ReOpen(); + + // reset sampler to regenerate indices + sampler_.Reset(); + + // set running flag to reset running + running_ = true; + running_cond_.notify_all(); + } + void ShareIndicesIntoScope(Scope* scope, std::vector indices) { auto* var = scope->Var(indices_var_name_); @@ -216,7 +246,11 @@ class DataReader { } private: - std::atomic running_; + bool running_; + std::condition_variable running_cond_; + bool shutdown_; + std::mutex mutex_; + std::thread reader_thread_; BlockDesc* reader_block_; @@ -283,6 +317,14 @@ class ReaderManager { id_to_reader_.erase(reader_id); } } + + void ResetReader(const int64_t reader_id) { + auto iter = id_to_reader_.find(reader_id); + if (iter != id_to_reader_.end()) { + iter->second->Reset(); + } + } + void ShutDown() { auto iter = id_to_reader_.begin(); while (iter != id_to_reader_.end()){ diff --git a/paddle/fluid/operators/data/dataloader_op.h b/paddle/fluid/operators/data/dataloader_op.h index 20d491442bdc39..c4bf872207fb73 100644 --- a/paddle/fluid/operators/data/dataloader_op.h +++ b/paddle/fluid/operators/data/dataloader_op.h @@ -39,8 +39,8 @@ class DataLoaderOpKernel : public framework::OpKernel { pipeline->ReadNext(output_vars); if (!pipeline->IsRunning()) { - LOG(ERROR) << "DataLoaderOpKernel Pipeline not running"; - data::PipelineManager::Instance()->ShutDownPipeline(program_id); + LOG(ERROR) << "DataLoaderOpKernel Pipeline not running, throw EOF"; + // data::PipelineManager::Instance()->ShutDownPipeline(program_id); throw platform::EOFException("DataLoaderOpKernel epoch end", __FILE__, __LINE__); } diff --git a/paddle/fluid/operators/data/map_runner.cc b/paddle/fluid/operators/data/map_runner.cc index e1b39f44adef67..84d85ed139c377 100644 --- a/paddle/fluid/operators/data/map_runner.cc +++ b/paddle/fluid/operators/data/map_runner.cc @@ -29,6 +29,7 @@ MapRunner::MapRunner( const std::vector> output_queues) : thread_pool_(1), running_(true), + shutdown_(false), map_block_(map_block), 
program_id_(program_id), place_(place), @@ -124,7 +125,12 @@ void MapRunner::StartMapThread(const Scope* scope) { auto& scope_ = scope->NewScope(); framework::Executor executor(place_); - while (running_.load()) { + while (!shutdown_) { + // check running or shutdown + std::unique_lock lock(mutex_); + running_cond_.wait(lock, [this] { return running_ || shutdown_; }); + if (shutdown_) break; + // Step 1: get input LoDTensor and share into Scope // LOG(ERROR) << "MapThread Loop " << program_id_ << " start"; bool success = ShareInputsIntoScope(&scope_); @@ -133,8 +139,8 @@ void MapRunner::StartMapThread(const Scope* scope) { while(queue->Size()) sleep(0.5); queue->Close(); } - running_.store(false); - return; + running_ = false; + continue; } // LOG(ERROR) << "MapThread Loop " << program_id_ << " ShareInputsIntoScope finish"; @@ -204,13 +210,23 @@ void MapRunner::CheckOutputVarStatus(const Variable &var, void MapRunner::ShutDown() { VLOG(1) << "MapRunner shutdown " << program_id_; // close all output queue, op after this op can shutdown itself - running_.store(false); + shutdown_ = true; + running_ = false; + running_cond_.notify_all(); for (auto queue : output_queues_) { if(queue && !queue->IsClosed()) queue->Close(); } } +void MapRunner::Reset() { + VLOG(1) << "MapRunner reset " << program_id_; + for (auto queue : output_queues_) queue->ReOpen(); + + running_ = true; + running_cond_.notify_all(); +} + // initialization static variables out of MapRunnerManager MapRunnerManager *MapRunnerManager::pm_instance_ptr_ = nullptr; std::mutex MapRunnerManager::m_; diff --git a/paddle/fluid/operators/data/map_runner.h b/paddle/fluid/operators/data/map_runner.h index f7f758e0abbfd8..fe951ae84256e3 100644 --- a/paddle/fluid/operators/data/map_runner.h +++ b/paddle/fluid/operators/data/map_runner.h @@ -51,7 +51,9 @@ class MapRunner { void ShutDown(); - inline bool IsRunning() { return running_.load(); } + void Reset(); + + inline bool IsRunning() { return running_; } private: @@ 
-71,7 +73,10 @@ class MapRunner { void CheckOutputVarStatus(const Variable &var, const std::string &var_name); ThreadPool thread_pool_; - std::atomic running_; + bool running_; + std::condition_variable running_cond_; + bool shutdown_; + std::mutex mutex_; std::shared_ptr map_block_; int64_t program_id_; @@ -121,14 +126,22 @@ class MapRunnerManager { } void ShutDownMapRunner(int program_id) { + std::lock_guard lk(m_); auto iter = prog_id_to_runner_.find(program_id); if (iter != prog_id_to_runner_.end()) { - std::lock_guard lk(m_); iter->second.get()->ShutDown(); prog_id_to_runner_.erase(iter); } } + void ResetMapRunner(int program_id) { + std::lock_guard lk(m_); + auto iter = prog_id_to_runner_.find(program_id); + if (iter != prog_id_to_runner_.end()) { + iter->second.get()->Reset(); + } + } + void ShutDown() { if (prog_id_to_runner_.empty()) return; diff --git a/paddle/fluid/operators/data/pipeline.h b/paddle/fluid/operators/data/pipeline.h index 5d651a8e4927ee..8215473bfcb3bf 100644 --- a/paddle/fluid/operators/data/pipeline.h +++ b/paddle/fluid/operators/data/pipeline.h @@ -47,6 +47,8 @@ class Pipeline { inline bool IsRunning() { return running_.load(); } + void Reset() { running_.store(true); } + private: void CheckOutputVarStatus(const Variable &var, const std::string &var_name); @@ -113,6 +115,13 @@ class PipelineManager { prog_id_to_pipeline_.erase(program_id); } + void ResetPipeline(int64_t program_id) { + auto iter = prog_id_to_pipeline_.find(program_id); + if (iter != prog_id_to_pipeline_.end()) { + iter->second.get()->Reset(); + } + } + void ShutDown() { prog_id_to_pipeline_.clear(); } diff --git a/paddle/fluid/operators/data/shutdown.h b/paddle/fluid/operators/data/utils.h similarity index 85% rename from paddle/fluid/operators/data/shutdown.h rename to paddle/fluid/operators/data/utils.h index d3eaa94d3feeee..f043574fa77d50 100644 --- a/paddle/fluid/operators/data/shutdown.h +++ b/paddle/fluid/operators/data/utils.h @@ -68,6 +68,21 @@ void 
ShutDownPipeline(const int64_t program_id) { LOG(ERROR) << "ShutDownPipeline program_id " << program_id << " finish"; } +void ResetDataLoader(const int64_t reader_id, + const std::vector map_ids, + const int64_t pipeline_id) { + // step 1: reset readers + ReaderManager::Instance()->ResetReader(reader_id); + + // step 2: reset maps + for (auto& map_id : map_ids) { + MapRunnerManager::Instance()->ResetMapRunner(map_id); + } + + // step3: reset pipeline + PipelineManager::Instance()->ResetPipeline(pipeline_id); +} + } // namespace data } // namespace operators } // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index c1a9068371ef75..b50a924919f55c 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -71,7 +71,7 @@ limitations under the License. */ #include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/common_infer_shape_functions.h" #include "paddle/fluid/operators/py_func_op.h" -#include "paddle/fluid/operators/data/shutdown.h" +#include "paddle/fluid/operators/data/utils.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/device/device_wrapper.h" @@ -777,6 +777,13 @@ PYBIND11_MODULE(core_noavx, m) { }); m.def("_shutdown_pipeline", &paddle::operators::data::ShutDownPipeline); + m.def("_reset_dataloader", [](const int64_t reader_id, + const std::vector map_ids, + const int64_t pipeline_id) { + paddle::operators::data::ResetDataLoader( + reader_id, map_ids, pipeline_id); + + }); py::class_ custom_op_kernel_ctx( m, "CustomOpKernelContext", R"DOC()DOC"); diff --git a/python/paddle/fluid/core.py b/python/paddle/fluid/core.py index 134f7761f360a0..e5f27b094fb8f2 100644 --- a/python/paddle/fluid/core.py +++ b/python/paddle/fluid/core.py @@ -285,6 +285,7 @@ def to_list(s): from .core_avx import _shutdown_readers_and_decoders from .core_avx import _shutdown_maps from .core_avx import _shutdown_pipeline + 
from .core_avx import _reset_dataloader if sys.platform != 'win32': from .core_avx import _set_process_pids from .core_avx import _erase_process_pids @@ -345,6 +346,7 @@ def to_list(s): from .core_noavx import _shutdown_readers_and_decoders from .core_noavx import _shutdown_maps from .core_noavx import _shutdown_pipeline + from .core_noavx import _reset_dataloader from .core_noavx import _Profiler, _ProfilerResult, _RecordEvent if sys.platform != 'win32': from .core_noavx import _set_process_pids diff --git a/python/paddle/fluid/dataloader/pipeline.py b/python/paddle/fluid/dataloader/pipeline.py index 309f2afdc9548a..14346a36bccc08 100755 --- a/python/paddle/fluid/dataloader/pipeline.py +++ b/python/paddle/fluid/dataloader/pipeline.py @@ -52,7 +52,7 @@ def __init__(self, queue_depth=2): if paddle.distributed.ParallelEnv().nranks > 1: paddle.set_device('gpu:%d' % paddle.distributed.ParallelEnv().dev_id) - paddle.distributed.init_parallel_env() + # paddle.distributed.init_parallel_env() def _init_programs(self): self._main_program = fluid.Program() @@ -102,11 +102,11 @@ def set_outputs(self, outputs): def build(self): global_block = self._main_program.desc.block(0) - program_id = _hash_with_id(self._main_program, self) + self._program_id = _hash_with_id(self._main_program, self) self._attrs = ('global_block', global_block, 'start_op_index', 0, 'end_op_index', global_block.op_size(), 'program_id', - program_id) + self._program_id) self._is_built = True def _prepare_output_vars(self):
dengkaipeng Date: Sun, 27 Mar 2022 16:06:46 +0000 Subject: [PATCH 73/95] unique shuffle seed in data_reader on multi-device --- paddle/fluid/operators/data/data_reader_op.cc | 5 ++++- paddle/fluid/operators/data/data_reader_op.h | 14 +++++++++----- python/paddle/fluid/dataloader/ops.py | 4 +++- python/paddle/vision/reader.py | 6 ++++-- 4 files changed, 20 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/operators/data/data_reader_op.cc b/paddle/fluid/operators/data/data_reader_op.cc index 67eeb641911f60..4d5a81fc83648d 100644 --- a/paddle/fluid/operators/data/data_reader_op.cc +++ b/paddle/fluid/operators/data/data_reader_op.cc @@ -57,6 +57,7 @@ class DataReaderOp : public framework::OperatorBase { auto num_samples = Attr("num_samples"); auto shuffle = Attr("shuffle"); auto drop_last = Attr("drop_last"); + auto seed = Attr("seed"); auto rank = Attr("rank"); auto world_size = Attr("world_size"); auto indices_var_name = Attr("indices_var_name"); @@ -69,7 +70,7 @@ class DataReaderOp : public framework::OperatorBase { ReaderManager::Instance()->StartDataReader( reader_id, reader_block, &scope, platform::CPUPlace(), indices_var_name, output_var_names, output_queues, batch_size, num_samples, - shuffle, drop_last, rank, world_size); + shuffle, drop_last, seed, rank, world_size); } }; @@ -97,6 +98,8 @@ class DataReaderOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(false); AddAttr("drop_last", "Whether drop last incomplete batch") .SetDefault(false); + AddAttr("seed", "Random seed for shuffle") + .SetDefault(0); AddAttr("rank", "The logical rank of current device.") .SetDefault(0); AddAttr("world_size", "The number of running devices.") diff --git a/paddle/fluid/operators/data/data_reader_op.h b/paddle/fluid/operators/data/data_reader_op.h index 2f79c22728ff21..62a93b59685060 100644 --- a/paddle/fluid/operators/data/data_reader_op.h +++ b/paddle/fluid/operators/data/data_reader_op.h @@ -42,7 +42,8 @@ class Sampler { public: explicit Sampler(const 
int64_t batch_size, const int64_t num_samples, const bool shuffle, const bool drop_last, - const int rank, const int world_size) + const int64_t seed, const int rank, + const int world_size) : current_iter_(0), batch_size_(batch_size), shuffle_(shuffle), @@ -67,7 +68,7 @@ class Sampler { num_samples_ = sample_ids_.size(); LOG(ERROR) << " Final num_samples " << num_samples_; if (shuffle) { - rnd_.seed(time(0)); + rnd_.seed(seed); std::shuffle(sample_ids_.begin(), sample_ids_.end(), rnd_); } } @@ -129,6 +130,7 @@ class DataReader { const int num_samples, const bool shuffle, const bool drop_last, + const int64_t seed, const int rank, const int world_size) : running_(true), @@ -140,7 +142,7 @@ class DataReader { output_queues_(output_queues), batch_size_(batch_size), sampler_(batch_size, num_samples, shuffle, - drop_last, rank, world_size) { + drop_last, seed, rank, world_size) { StartReaderThread(scope); } @@ -300,13 +302,15 @@ class ReaderManager { const std::vector &output_var_names, const std::vector> &output_queues, const int batch_size, const int num_samples, const bool shuffle, - const bool drop_last, const int rank, const int world_size) { + const bool drop_last, const int64_t seed, const int rank, + const int world_size) { auto iter = id_to_reader_.find(reader_id); if (iter == id_to_reader_.end()) { id_to_reader_[reader_id] = std::unique_ptr( new DataReader(reader_block, scope, place, indices_var_name, output_var_names, output_queues, batch_size, - num_samples, shuffle, drop_last, rank, world_size)); + num_samples, shuffle, drop_last, seed, + rank, world_size)); } } diff --git a/python/paddle/fluid/dataloader/ops.py b/python/paddle/fluid/dataloader/ops.py index 1a3fe9657bb152..adfb0a8f23e07b 100755 --- a/python/paddle/fluid/dataloader/ops.py +++ b/python/paddle/fluid/dataloader/ops.py @@ -116,7 +116,8 @@ def data_reader(reader_func, batch_size=1, num_samples=1, shuffle=False, - drop_last=False): + drop_last=False, + seed=None): assert not in_dygraph_mode(), \ 
"paddle.io.data_reader can only be used in static mode" helper = LayerHelper("data_reader", **locals()) @@ -153,6 +154,7 @@ def data_reader(reader_func, "num_samples": num_samples, "shuffle": shuffle, "drop_last": drop_last, + "seed": 0 if seed is None else seed, "rank": paddle.distributed.get_rank(), "world_size": paddle.distributed.get_world_size() } diff --git a/python/paddle/vision/reader.py b/python/paddle/vision/reader.py index 67f037adc65313..cd4e19d3761623 100644 --- a/python/paddle/vision/reader.py +++ b/python/paddle/vision/reader.py @@ -132,7 +132,8 @@ def file_label_loader(data_root, indices, name=None): def file_label_reader(file_root, batch_size=1, shuffle=False, - drop_last=False): + drop_last=False, + seed=None): """ Reads and outputs the bytes contents of a file as a uint8 Tensor with one dimension. @@ -178,7 +179,8 @@ def _reader(indices): batch_size=batch_size, num_samples=len(samples), shuffle=shuffle, - drop_last=drop_last) + drop_last=drop_last, + seed=seed) # inputs = dict() # attrs = { # 'root_dir': file_root, From be35dbb6b139d5a60a4f13520e3a01a52e05a352 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Mon, 28 Mar 2022 08:23:56 +0000 Subject: [PATCH 74/95] clean code. 
test=develop --- paddle/fluid/operators/data/CMakeLists.txt | 1 - .../fluid/operators/data/batch_decode_op.cu | 3 - .../data/batch_decode_random_crop_op.cc | 16 - .../data/batch_decode_random_crop_op.cu | 3 - .../fluid/operators/data/batch_resize_op.cu | 16 +- paddle/fluid/operators/data/data_reader_op.cc | 1 - paddle/fluid/operators/data/data_reader_op.h | 12 +- paddle/fluid/operators/data/data_scope.h | 280 ------------------ paddle/fluid/operators/data/dataloader_op.h | 6 +- .../operators/data/file_label_loader_op.cc | 84 ------ .../operators/data/file_label_loader_op.h | 260 +--------------- paddle/fluid/operators/data/map_op.cc | 1 - paddle/fluid/operators/data/map_runner.cc | 13 - paddle/fluid/operators/data/pipeline.cc | 2 +- .../data/random_crop_and_resize_op.cc | 5 - .../data/random_crop_and_resize_op.cu | 31 +- paddle/fluid/operators/data/random_flip_op.cc | 90 ------ paddle/fluid/operators/data/random_flip_op.h | 84 ------ .../operators/data/random_roi_generator.h | 1 - .../operators/data/unity_build_rule.cmake | 13 +- paddle/fluid/operators/data/utils.h | 13 +- python/paddle/fluid/dataloader/pipeline.py | 6 - 22 files changed, 24 insertions(+), 917 deletions(-) delete mode 100644 paddle/fluid/operators/data/data_scope.h delete mode 100644 paddle/fluid/operators/data/random_flip_op.cc delete mode 100644 paddle/fluid/operators/data/random_flip_op.h diff --git a/paddle/fluid/operators/data/CMakeLists.txt b/paddle/fluid/operators/data/CMakeLists.txt index 386529476f947b..d83d91b768dea2 100644 --- a/paddle/fluid/operators/data/CMakeLists.txt +++ b/paddle/fluid/operators/data/CMakeLists.txt @@ -22,7 +22,6 @@ op_library(batch_resize_op SRCS batch_resize_op.cc batch_resize_op.cu DEPS ${OP_ op_library(file_label_loader_op SRCS file_label_loader_op.cc DEPS ${OP_HEADER_DEPS}) -op_library(random_flip_op SRCS random_flip_op.cc DEPS ${OP_HEADER_DEPS}) op_library(mirror_normalize_op SRCS mirror_normalize_op.cc mirror_normalize_op.cu DEPS ${OP_HEADER_DEPS}) # 
register_operators() diff --git a/paddle/fluid/operators/data/batch_decode_op.cu b/paddle/fluid/operators/data/batch_decode_op.cu index f9b2a65d397d1d..dfd45a3a0073e0 100644 --- a/paddle/fluid/operators/data/batch_decode_op.cu +++ b/paddle/fluid/operators/data/batch_decode_op.cu @@ -28,7 +28,6 @@ class GPUBatchDecodeKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { int num_threads = ctx.Attr("num_threads"); - LOG(ERROR) << "GPUBatchDecodeJpegKernel Compute start, num_threads: " << num_threads; auto mode = ctx.Attr("mode"); auto local_rank = ctx.Attr("local_rank"); auto program_id = ctx.Attr("program_id"); @@ -65,8 +64,6 @@ class GPUBatchDecodeKernel : public framework::OpKernel { } decode_pool->RunAll(true); - - LOG(ERROR) << "GPUBatchDecodeJpegKernel Compute finish"; } }; diff --git a/paddle/fluid/operators/data/batch_decode_random_crop_op.cc b/paddle/fluid/operators/data/batch_decode_random_crop_op.cc index f0e201e8196ddc..1a211332d2a30a 100644 --- a/paddle/fluid/operators/data/batch_decode_random_crop_op.cc +++ b/paddle/fluid/operators/data/batch_decode_random_crop_op.cc @@ -63,22 +63,6 @@ class BatchDecodeRandomCropOp : public framework::OperatorWithKernel { platform::errors::InvalidArgument( "num_attempts should be a positive integerm, but " "received %d", num_attempts)); - - // auto mode = ctx->Attrs().Get("mode"); - // std::vector out_dims; - // - // if (mode == "unchanged") { - // out_dims = {-1, -1, -1}; - // } else if (mode == "gray") { - // out_dims = {1, -1, -1}; - // } else if (mode == "rgb") { - // out_dims = {3, -1, -1}; - // } else { - // PADDLE_THROW(platform::errors::Fatal( - // "The provided mode is not supported for JPEG files on GPU: ", mode)); - // } - // - // ctx->SetOutputDim("Out", framework::make_ddim(out_dims)); } protected: diff --git a/paddle/fluid/operators/data/batch_decode_random_crop_op.cu b/paddle/fluid/operators/data/batch_decode_random_crop_op.cu index 
c3837b61c9bd34..b2dc8a8238be8c 100644 --- a/paddle/fluid/operators/data/batch_decode_random_crop_op.cu +++ b/paddle/fluid/operators/data/batch_decode_random_crop_op.cu @@ -51,7 +51,6 @@ class GPUBatchDecodeRandomCropKernel : public framework::OpKernel { const framework::LoDTensorArray* inputs = ctx.Input("X"); int batch_size = inputs->size(); - LOG(ERROR) << "GPUBatchDecodeJpegKernel Compute start, num_threads: " << num_threads << ", batch_size: " << batch_size << ", program_id: " << program_id; auto* out = ctx.OutputVar("Out"); auto dev = platform::CUDAPlace(local_rank); @@ -125,8 +124,6 @@ class GPUBatchDecodeRandomCropKernel : public framework::OpKernel { trans(dev_ctx, temp_array[i], &out_array[i], axis); } } - - LOG(ERROR) << "GPUBatchDecodeJpegKernel Compute finish"; } }; diff --git a/paddle/fluid/operators/data/batch_resize_op.cu b/paddle/fluid/operators/data/batch_resize_op.cu index c0764128c6cc03..2344601840b886 100644 --- a/paddle/fluid/operators/data/batch_resize_op.cu +++ b/paddle/fluid/operators/data/batch_resize_op.cu @@ -38,8 +38,7 @@ __global__ void KeNearestNeighborInterpFw( int out_id_h = tid / output_w; // single image's index int out_id_w = tid % output_w; - // input_w or output_w = c * h * w - // img_size = h * w + // input_w or output_w = c * h * w, img_size = h * w int in_img_size = input_w / num_channels; int out_img_size = output_w / num_channels; @@ -91,8 +90,7 @@ __global__ void KeBilinearInterpFw( int out_id_h = tid / output_w; // single image's index int out_id_w = tid % output_w; - // input_w or output_w = c * h * w - // img_size = h * w + // input_w or output_w = c * h * w, img_size = h * w int in_img_size = input_w / num_channels; int out_img_size = output_w / num_channels; @@ -212,7 +210,6 @@ template class BatchResizeCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - LOG(ERROR) << "BatchResizeCUDAKernel Compute start"; PADDLE_ENFORCE_EQ( 
platform::is_gpu_place(ctx.GetPlace()), true, platform::errors::NotFound("This kernel only runs on GPU device.")); @@ -254,20 +251,11 @@ class BatchResizeCUDAKernel : public framework::OpKernel { data_layout == DataLayout::kNCHW ? img->dims()[1] : img->dims()[0]; img_w = data_layout == DataLayout::kNCHW ? img->dims()[2] : img->dims()[1]; - // GetCropParameters(img_h, img_w, scale, ratio, &idx_h, &idx_w, &crop_h, - // &crop_w, seed); - auto out_tensor = out->Slice(i, i + 1); ResizeFwd(ctx, *img, &out_tensor, size, interp_method, align_corners, align_mode, img_h, img_w, img_c, data_layout); } - - // framework::LoDTensorArray out_array; - // out_array.reserve(1); - // out_array.emplace_back(out); - // out_queue->Push(out_array); - LOG(ERROR) << "BatchResizeCUDAKernel Compute finish"; } }; diff --git a/paddle/fluid/operators/data/data_reader_op.cc b/paddle/fluid/operators/data/data_reader_op.cc index 4d5a81fc83648d..7a287a137a9ae5 100644 --- a/paddle/fluid/operators/data/data_reader_op.cc +++ b/paddle/fluid/operators/data/data_reader_op.cc @@ -64,7 +64,6 @@ class DataReaderOp : public framework::OperatorBase { auto output_var_names = Attr>("output_var_names"); auto* reader_block = Attr("reader_block"); auto reader_id = Attr("reader_id"); - LOG(ERROR) << "DataReaderOp enter, reader_id: " << reader_id; auto output_queues = GetQueueVecFromVariableVec(output_vars); ReaderManager::Instance()->StartDataReader( diff --git a/paddle/fluid/operators/data/data_reader_op.h b/paddle/fluid/operators/data/data_reader_op.h index 62a93b59685060..61831cd8b2d61c 100644 --- a/paddle/fluid/operators/data/data_reader_op.h +++ b/paddle/fluid/operators/data/data_reader_op.h @@ -50,13 +50,12 @@ class Sampler { drop_last_(drop_last), rank_(rank), world_size_(world_size) { - LOG(ERROR) << "Sampler num_samples " << num_samples; int trunc_num_samples; if (drop_last) { int total_batch_size = world_size * batch_size; trunc_num_samples = floor(num_samples / total_batch_size) * total_batch_size; 
sample_ids_.reserve(trunc_num_samples); - LOG(ERROR) << " Trunc sampler num_samples " << trunc_num_samples; + VLOG(4) << "Sampler trunc sampler num_samples " << trunc_num_samples; } else{ sample_ids_.reserve(num_samples); @@ -66,7 +65,6 @@ class Sampler { sample_ids_.emplace_back(i); } num_samples_ = sample_ids_.size(); - LOG(ERROR) << " Final num_samples " << num_samples_; if (shuffle) { rnd_.seed(seed); std::shuffle(sample_ids_.begin(), sample_ids_.end(), rnd_); @@ -80,17 +78,14 @@ class Sampler { current_iter_++; if (start_idx >= num_samples_) { - LOG(ERROR) << " start idx >= num samples " << start_idx << " >= " << num_samples_; + VLOG(4) << " start idx >= num samples " << start_idx << " >= " << num_samples_; return; } - // if (drop_last_ && start_idx + batch_size_ >= num_samples_) return; - // int64_t batch_len = std::min(batch_size_, num_samples_ - start_idx); - // indices->reserve(batch_len); for (int64_t i = 0; i < batch_size_; i++) { int cur_idx = start_idx + i * world_size_; if (cur_idx >= num_samples_) { - LOG(ERROR) << " cur_idx >= num samples " << cur_idx << " >= " << num_samples_; + VLOG(4) << " cur_idx >= num samples " << cur_idx << " >= " << num_samples_; return; } indices->emplace_back(sample_ids_[cur_idx]); @@ -164,7 +159,6 @@ class DataReader { sampler_.GetNextIndices(&indices); // shutdown reader if indices drained if (indices.size() == 0) { - LOG(ERROR) << "DataReader indices drained"; for(auto& queue: output_queues_) { while (queue->Size()) sleep(0.5); queue->Close(); diff --git a/paddle/fluid/operators/data/data_scope.h b/paddle/fluid/operators/data/data_scope.h deleted file mode 100644 index 92939fa86a2024..00000000000000 --- a/paddle/fluid/operators/data/data_scope.h +++ /dev/null @@ -1,280 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -extern "C" { -#include -} - -#include -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/operators/reader/blocking_queue.h" - -// When in inference scenario, the scopes will not be written by two threads in -// a mean time, but a scope may be read by multiple threads concurrently, and -// the mutex will cause serious performance issue. -// So the mutex is disabled when `ON_INFER`. -#ifdef PADDLE_ON_INFERENCE -#define SCOPE_KIDS_READER_LOCK -#define SCOPE_KIDS_WRITER_LOCK -#define SCOPE_VARS_READER_LOCK -#define SCOPE_VARS_WRITER_LOCK -#else -#define SCOPE_KIDS_READER_LOCK AutoRDLock auto_lock(&kids_lock_); -#define SCOPE_KIDS_WRITER_LOCK AutoWRLock auto_lock(&kids_lock_); -#define SCOPE_VARS_READER_LOCK AutoRDLock auto_lock(&vars_lock_); -#define SCOPE_VARS_WRITER_LOCK AutoWRLock auto_lock(&vars_lock_); -#endif - -namespace paddle { -namespace framework { - -class Scope; -class Variable; - -template -using BlockingQueue = operators::reader::BlockingQueue; - -/** - * @brief DataScope that manage all variables in data pipeline. - * - * In data pipeline, we need a queue between each OPs to buffer data - * to support data prefetch and OP running asynchronously, DataScope - * contains name -> Variable map as {name: BlockingQueue} - */ -class DataScope : public Scope { - public: - - DataScope() {} - - /// Create a sub-scope. Returns a reference other than a pointer so - /// to prevent from manual deletion. 
- /// Mark it to const because that new kid scope cannot change parent scope. - DataScope& NewScope() const { - DataScope* child = new DataScope(this); - { - SCOPE_KIDS_WRITER_LOCK - kids_.push_back(child); - } - return *child; - } - - /// Create a sub-scope for current scope but do not record it in the kids to - /// avoid performance problems. - std::unique_ptr NewTmpScope() const { - return std::unique_ptr(new DataScope(this)); - } - - // void EraseVars(const std::vector& var_names) { - // std::set var_set(var_names.begin(), var_names.end()); - // SCOPE_VARS_WRITER_LOCK - // for (auto it = var_queues_.begin(); it != var_queues_.end();) { - // if (var_set.find(it->first) != var_set.end()) { - // it = var_queues_.erase(it); - // } else { - // ++it; - // } - // } - // } - // - // void EraseVarsExcept(const std::unordered_set& vars) { - // SCOPE_VARS_WRITER_LOCK - // for (auto iter = var_queues_.begin(); iter != var_queues_.end();) { - // if (vars.count(iter->second.get()) != 0) { - // ++iter; - // } else { - // var_queues_.erase(iter++); - // } - // } - // } - - // /// Find a variable in the scope or any of its ancestors. Returns - // /// nullptr if cannot find. - // /// Caller doesn't own the returned Variable. - // Variable* FindVar(const std::string& name) const { - // SCOPE_VARS_READER_LOCK - // return FindVarInternal(name); - // } - - // // Get a variable in the scope or any of its ancestors. Enforce - // /// the returned Variable is not nullptr - // Variable* GetVar(const std::string& name) const { - // auto* var = FindVar(name); - // PADDLE_ENFORCE_NOT_NULL( - // var, platform::errors::NotFound("Cannot find %s in scope.", name)); - // return var; - // } - - /// Find a variable in the current scope. - /// Return nullptr if cannot find. - /// Caller doesn't own the returned Variable. 
- Variable* FindLocalVar(const std::string& name) const { - SCOPE_VARS_READER_LOCK - return FindVarLocally(name); - } - - const Scope* parent() const { return parent_; } - - /// Find the scope or an ancestor scope that contains the given variable. - // const Scope* FindScope(const Variable* var) const; - - // /// Find the scope or an ancestor scope that contains the given variable name. - // const Scope* FindScope(const std::string& name) const; - - // void DeleteScope(Scope* scope) const; - - // /// Drop all kids scopes belonged to this scope. - // void DropKids(); - - // /// Find if a scope exists in the kid scopes - // bool HasKid(const Scope* scope) const; - - // const std::list& kids() const { return kids_; } - - // enumerate all the variables current contains. - std::vector LocalVarNames() const { - std::vector known_vars; - { - SCOPE_VARS_READER_LOCK - known_vars.reserve(this->var_queues_.size()); - for (auto& p : var_queues_) { - known_vars.emplace_back(p.first); - } - } - return known_vars; - } - - // // Rename variable to a new name - // void Rename(const std::string& origin_name, - // const std::string& new_name) const; - // - // // Rename variable to a new name and return the new name - // std::string Rename(const std::string& origin_name) const; - - protected: - // struct KeyHasher { - // std::size_t operator()(const std::string& key) const { - // return XXH32(key.c_str(), key.size(), 1); - // } - // }; - - mutable std::unordered_map>, KeyHasher> var_queues_; - - private: - // Call NewScope for a sub-scope. - explicit DataScope(Scope const* parent) : parent_(parent) {} - - // Called by Var. 
- Variable* VarInternal(const std::string& name) { - auto* v = FindVarLocally(name); - if (v != nullptr) return v; - - auto q = GetBlockingQueue(name); - v = new Variable(); - q->Send(*v); - VLOG(3) << "Create Variable BlockingQueue and Create a Variable in it" << name; - return v; - } - - Variable* FindVarInternal(const std::string& name) const { - auto var = FindVarLocally(name); - if (var != nullptr) { - return var; - } - return (parent_ == nullptr) ? nullptr : parent_->FindVar(name); - } - - // // Called by FindScope. - // const Scope* FindScopeInternal(const Variable* var) const { - // for (auto& kv : var_queues_) { - // if (kv.second.get() == var) { - // return this; - // } - // } - // return (parent_ == nullptr) ? nullptr : parent_->FindScope(var); - // } - - // // Called by FindScope. - const Scope* FindScopeInternal(const std::string& name) const { - if (var_queues_.find(name) != var_queues_.end()) { - return this; - } - return (parent_ == nullptr) ? nullptr : parent_->FindScope(name); - } - - // // Called by Rename. - void RenameInternal(const std::string& origin_name, - const std::string& new_name) const { - auto origin_it = var_queues_.find(origin_name); - PADDLE_ENFORCE_NE( - origin_it, var_queues_.end(), - platform::errors::NotFound( - "Original variable with name %s is not found in the scope.", - origin_name)); - auto new_it = var_queues_.find(new_name); - PADDLE_ENFORCE_EQ( - new_it, var_queues_.end(), - platform::errors::AlreadyExists( - "The variable with name %s already exists in the scope.", new_name)); - var_queues_[new_name].reset(origin_it->second.release()); - var_queues_.erase(origin_it); - } - - // Called by FindVarInternal and Var. 
- Variable* FindVarLocally(const std::string& name) const { - auto it = var_queues_.find(name); - if (it != var_queues_.end()) { - auto q = it->second.get(); - Variable* v = nullptr; - if (q->Size() <= 0 || !q->Receive(v)) { - return nullptr; - } - return v; - } - return nullptr; - } - - BlockingQueue* GetBlockingQueue(const std::string& name) const { - auto it = var_queues_.find(name); - if (it != var_queues_.end()) { - return it->second.get(); - } - auto q = new BlockingQueue(2); - var_queues_.emplace(name, std::unique_ptr>(q)); - return q; - } - - // Scope in `kids_` are owned by this class. - mutable std::list kids_; - const Scope* parent_{nullptr}; - - DISABLE_COPY_AND_ASSIGN(DataScope); - -#ifndef PADDLE_ON_INFERENCE - - private: - mutable RWLock kids_lock_; - mutable RWLock vars_lock_; -#endif -}; -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/operators/data/dataloader_op.h b/paddle/fluid/operators/data/dataloader_op.h index c4bf872207fb73..dc227f43cfcaaa 100644 --- a/paddle/fluid/operators/data/dataloader_op.h +++ b/paddle/fluid/operators/data/dataloader_op.h @@ -22,7 +22,6 @@ template class DataLoaderOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - LOG(ERROR) << "DataLoaderOpKernel enter"; // Step1: get output vars and attrs auto output_vars = ctx.MultiOutputVar("Out"); auto output_var_names = ctx.OutputNames("Out"); @@ -39,13 +38,10 @@ class DataLoaderOpKernel : public framework::OpKernel { pipeline->ReadNext(output_vars); if (!pipeline->IsRunning()) { - LOG(ERROR) << "DataLoaderOpKernel Pipeline not running, throw EOF"; - // data::PipelineManager::Instance()->ShutDownPipeline(program_id); + VLOG(4) << "DataLoaderOpKernel Pipeline not running, throw EOF"; throw platform::EOFException("DataLoaderOpKernel epoch end", __FILE__, __LINE__); } - - LOG(ERROR) << "DataLoaderOpKernel finish"; } }; diff --git a/paddle/fluid/operators/data/file_label_loader_op.cc 
b/paddle/fluid/operators/data/file_label_loader_op.cc index 0081891c6f8b09..3b26438db00d7f 100644 --- a/paddle/fluid/operators/data/file_label_loader_op.cc +++ b/paddle/fluid/operators/data/file_label_loader_op.cc @@ -38,18 +38,6 @@ class FileLabelLoaderOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(dim_indices.size(), 1, platform::errors::InvalidArgument( "Input(Indices) should be a 1-D Tensor")); - - // auto files = ctx->Attrs().Get>("files"); - // auto labels = ctx->Attrs().Get>("labels"); - // PADDLE_ENFORCE_GT(files.size(), 0, - // platform::errors::InvalidArgument( - // "length of files should be greater than 0")); - // PADDLE_ENFORCE_GT(labels.size(), 0, - // platform::errors::InvalidArgument( - // "length of labels should be greater than 0")); - // PADDLE_ENFORCE_EQ(files.size(), labels.size(), - // platform::errors::InvalidArgument( - // "length of labels and files should be equal")); } framework::OpKernelType GetExpectedKernelType( @@ -57,49 +45,6 @@ class FileLabelLoaderOp : public framework::OperatorWithKernel { return framework::OpKernelType(framework::proto::VarType::UINT8, platform::CPUPlace()); } - -// private: -// void RunImpl(const framework::Scope& scope, -// const platform::Place& dev_place) const override { -// LOG(ERROR) << "FileLabelLoaderOp RunImpl start"; -// platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); -// auto& dev_ctx = *pool.Get(dev_place); -// framework::RuntimeContext run_ctx(Inputs(), Outputs(), scope); -// framework::ExecutionContext ctx(*this, scope, dev_ctx, run_ctx); -// -// auto* out = scope.FindVar(Output("Out")); -// auto out_queue = out->Get().GetQueue(); -// if (out_queue == nullptr) { -// LOG(ERROR) << "FileLabelLoaderOp init output queue"; -// auto* holder = out->template GetMutable(); -// holder->InitOnce(2); -// out_queue = holder->GetQueue(); -// } -// -// auto* out_label = scope.FindVar(Output("Label")); -// auto out_label_queue = -// out_label->Get().GetQueue(); -// if 
(out_label_queue == nullptr) { -// LOG(ERROR) << "FileLabelLoaderOp init output label queue"; -// auto* label_holder = -// out_label->template GetMutable(); -// label_holder->InitOnce(2); -// out_label_queue = label_holder->GetQueue(); -// } -// -// ReaderManager::Instance()->GetReader( -// 0, ctx, out_queue.get(), out_label_queue.get()); -// // LoDTensorArray samples = reader_wrapper.reader->Next(); -// // framework::LoDTensorArray out_array; -// // out_array.resize(samples.size()); -// // for (size_t i = 0; i < samples.size(); ++i) { -// // copy_tensor(samples[i], &out_array[i]); -// // } -// // out_queue->Push(out_array); -// LOG(ERROR) << "FileLabelLoaderOp RunImpl finish"; -// } - - // std::shared_ptr reader=nullptr; }; class FileLabelLoaderOpMaker : public framework::OpProtoAndCheckerMaker { @@ -109,41 +54,12 @@ class FileLabelLoaderOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Image", "The output image tensor of ReadFileLoader op"); AddOutput("Label", "The output label tensor of ReadFileLoader op"); AddAttr("data_root", "Path of root directory of dataset"); - // AddAttr>("files", "Path of the file to be readed.") - // .SetDefault({}); - // AddAttr>("labels", "Path of the file to be readed.") - // .SetDefault({}); AddComment(R"DOC( This operator read a file. 
)DOC"); - // AddAttr("root_dir", "Path of the file to be readed.") - // .SetDefault(""); - // AddAttr("batch_size", "Path of the file to be readed.").SetDefault(1); - // AddAttr("rank", "Path of the file to be readed.").SetDefault(0); - // AddAttr("world_size", "Path of the file to be readed.").SetDefault(1); - // AddAttr("reader_id", - // "(int64_t)" - // "The unique hash id used as cache key for " - // "ExecutorInfoCache").SetDefault(0);; } }; -// class FileLabelReaderInferShape : public framework::InferShapeBase { -// public: -// void operator()(framework::InferShapeContext* context) const override { -// OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", -// "FileLabelReader"); -// } -// }; -// -// class FileLabelReaderInferVarType : public framework::VarTypeInference { -// public: -// void operator()(framework::InferVarTypeContext* ctx) const override { -// ctx->SetOutputType("Out", framework::proto::VarType::LOD_TENSOR_ARRAY, -// framework::ALL_ELEMENTS); -// } -// }; - } // namespace data } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/data/file_label_loader_op.h b/paddle/fluid/operators/data/file_label_loader_op.h index 9f2d6d1098d3f7..7e6b0a555acafe 100644 --- a/paddle/fluid/operators/data/file_label_loader_op.h +++ b/paddle/fluid/operators/data/file_label_loader_op.h @@ -33,46 +33,6 @@ namespace data { using LoDTensor = framework::LoDTensor; using LoDTensorArray = framework::LoDTensorArray; -// static void ParseClasses(const std::string data_root, -// std::vector* classes) { -// _finddata_t findData; -// auto handle = _findfirst(data_root, &findData); -// PADDLE_ENFORCE_NE(handle, -1, platform::errors::InvalidArgument( -// "Cannot find files under data_root")); -// -// do { -// if (findData.attrib & _A_SUBDIRi && findData.name != "." 
-// && findData.name != "..") { -// classes->emplace_back(findData.name); -// } -// } while (_findnext(handle, &findData) == 0); -// -// std::sort(classes->begin(), classes->end()); -// for (size_t i = 0; i < classes->size(); i++) { -// LOG(ERROR) << "class id " << i << ": " << classes->at(i); -// } -// } - -// static void ParseFilesAndLabels(const std::string data_root, -// std::vector* files, -// std::vector labels) { -// std::vector classes; -// ParseClasses(data_root, &classes); -// -// _finddata_t findData; -// for (int i = 0; i < static_cast(classes.size()); i++) { -// auto cls_dir = data_root + "/" + classes[i]; -// auto handle = _findfirst(cls_dir, &findData); -// if (handle == -1) break; -// -// do { -// if (findData.name == "." || findData.name == "..") continue; -// files->emplace_back(cls_dir + "/" + findData.name); -// labels->emplace_back(i); -// } -// } -// } - #ifdef _WIN32 constexpr char DIR_SEP = '\\'; #else @@ -157,202 +117,17 @@ static std::vector>* GetFilesAndLabelsFromCache(cons if (iter == root_to_samples_.end()) { std::vector> samples; ParseFilesAndLabels(data_root, &samples); - LOG(ERROR) << "Init samples: " << samples.size(); + VLOG(4) << "Init sample number: " << samples.size(); root_to_samples_[data_root] = samples; } return &(root_to_samples_[data_root]); } -// class FileDataReader { -// public: -// explicit FileDataReader(const framework::ExecutionContext& ctx, -// LoDTensorBlockingQueue* queue, LoDTensorBlockingQueue* label_queue) -// : queue_(queue), label_queue_(label_queue){ -// std::vector files = -// ctx.Attr>("files"); -// std::vector labels = ctx.Attr>("labels"); -// rank_ = ctx.Attr("rank"); -// world_size_ = ctx.Attr("world_size"); -// -// batch_size_ = ctx.Attr("batch_size"); -// current_epoch_ = 0; -// current_iter_ = 0; -// // iters_per_epoch_ = labels.size() / (batch_size_ * world_size_); -// auto total_batch_size = batch_size_ * world_size_; -// iters_per_epoch_ = (labels.size() + total_batch_size) / total_batch_size; 
-// is_closed_ = false; -// for (int i = 0, n = files.size(); i < n; i++) -// image_label_pairs_.emplace_back(std::move(files[i]), labels[i]); -// StartLoadThread(); -// } -// -// int GetStartIndex() { -// int start_idx = -// batch_size_ * world_size_ * (current_iter_ % iters_per_epoch_) + -// rank_ * batch_size_; -// current_iter_++; -// return start_idx; -// } -// -// framework::LoDTensor ReadSample(const std::string filename) { -// std::ifstream input(filename.c_str(), -// std::ios::in | std::ios::binary | std::ios::ate); -// std::streamsize file_size = input.tellg(); -// -// input.seekg(0, std::ios::beg); -// -// // auto* out = ctx.Output("Out"); -// framework::LoDTensor out; -// std::vector out_shape = {file_size}; -// out.Resize(framework::make_ddim(out_shape)); -// -// uint8_t* data = out.mutable_data(platform::CPUPlace()); -// -// input.read(reinterpret_cast(data), file_size); -// return out; -// } -// -// void StartLoadThread() { -// if (load_thrd_.joinable()) { -// return; -// } -// -// load_thrd_ = std::thread([this] { -// while (!is_closed_.load()) LoadBatch(); -// }); -// } -// -// void ShutDown() { -// if (queue_ && !queue_->IsClosed()) queue_->Close(); -// if (label_queue_ && !label_queue_->IsClosed()) label_queue_->Close(); -// -// is_closed_.store(true); -// if (load_thrd_.joinable()) { -// load_thrd_.join(); -// } -// } -// -// -// std::pair> Read() { -// LoDTensorArray ret; -// std::vector label; -// ret.reserve(batch_size_); -// int start_index = GetStartIndex(); -// for (int32_t i = start_index; i < start_index + batch_size_; ++i) { -// if (static_cast(i) >= image_label_pairs_.size()) { -// // FIXME(dkp): refine close pipeline -// while (queue_->Size()) sleep(0.5); -// queue_->Close(); -// while (label_queue_->Size()) sleep(0.5); -// label_queue_->Close(); -// -// is_closed_.store(true); -// break; -// } -// i %= image_label_pairs_.size(); -// framework::LoDTensor tmp = ReadSample(image_label_pairs_[i].first); -// ret.push_back(std::move(tmp)); 
-// label.push_back(image_label_pairs_[i].second); -// } -// return std::make_pair(ret, label); -// } -// -// -// void LoadBatch() { -// -// auto batch_data = std::move(Read()); -// queue_->Push(batch_data.first); -// framework::LoDTensor label_tensor; -// LoDTensorArray label_array; -// // auto& label_tensor = label.GetMutable(); -// label_tensor.Resize( -// framework::make_ddim({static_cast(batch_data.first.size())})); -// platform::CPUPlace cpu; -// auto* label_data = label_tensor.mutable_data(cpu); -// for (size_t i = 0; i < batch_data.first.size(); ++i) { -// label_data[i] = batch_data.second[i]; -// } -// label_array.push_back(label_tensor); -// label_queue_->Push(label_array); -// } -// -// private: -// int batch_size_; -// std::string file_root_, file_list_; -// std::vector> image_label_pairs_; -// int current_epoch_; -// int current_iter_; -// int rank_; -// int world_size_; -// int iters_per_epoch_; -// std::atomic is_closed_; -// Buffer batch_buffer_; -// std::thread load_thrd_; -// LoDTensorBlockingQueue* queue_; -// LoDTensorBlockingQueue* label_queue_; -// }; -// -// -// class ReaderManager { -// // PipelineManager is a signleton manager for Pipeline, we -// // create single Pipeline for a program id -// private: -// DISABLE_COPY_AND_ASSIGN(ReaderManager); -// -// static ReaderManager *rm_instance_ptr_; -// static std::mutex m_; -// -// std::map> prog_id_to_reader_; -// -// public: -// static ReaderManager *Instance() { -// if (rm_instance_ptr_ == nullptr) { -// std::lock_guard lk(m_); -// if (rm_instance_ptr_ == nullptr) { -// rm_instance_ptr_ = new ReaderManager; -// } -// } -// return rm_instance_ptr_; -// } -// -// // FileDataReader* GetReader( -// void GetReader( -// int64_t program_id, const framework::ExecutionContext& ctx, -// LoDTensorBlockingQueue* queue, LoDTensorBlockingQueue* label_queue) { -// auto iter = prog_id_to_reader_.find(program_id); -// if (iter == prog_id_to_reader_.end()) { -// prog_id_to_reader_[program_id] = 
std::unique_ptr(new FileDataReader(ctx, queue, label_queue)); -// // return prog_id_to_reader_[program_id].get(); -// } else { -// // return iter->second.get(); -// } -// } -// -// void ShutDown() { -// auto iter = prog_id_to_reader_.begin(); -// while (iter != prog_id_to_reader_.end()){ -// if(iter->second.get()){ -// iter->second->ShutDown(); -// } -// iter++; -// } -// prog_id_to_reader_.clear(); -// } -// -// ReaderManager() { VLOG(1) << "ReaderManager init"; } -// -// ~ReaderManager() { -// VLOG(1) << "~ReaderManager"; -// ShutDown(); -// } -// }; - template class FileLabelLoaderCPUKernel: public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - LOG(ERROR) << "FileLabelLoaderOp RunImpl start"; auto* indices = ctx.Input("Indices"); auto* image_arr = ctx.Output("Image"); auto* label_tensor = ctx.Output("Label"); @@ -389,38 +164,7 @@ class FileLabelLoaderCPUKernel: public framework::OpKernel { image_arr->emplace_back(image); label_data[i] = static_cast(label); } - - LOG(ERROR) << "FileLabelLoaderOp RunImpl finish"; - - // auto out_queue = out->Get().GetQueue(); - // if (out_queue == nullptr) { - // LOG(ERROR) << "FileLabelLoaderOp init output queue"; - // auto* holder = out->template GetMutable(); - // holder->InitOnce(2); - // out_queue = holder->GetQueue(); - // } - // - // auto* out_label = scope.FindVar(Output("Label")); - // auto out_label_queue = - // out_label->Get().GetQueue(); - // if (out_label_queue == nullptr) { - // LOG(ERROR) << "FileLabelLoaderOp init output label queue"; - // auto* label_holder = - // out_label->template GetMutable(); - // label_holder->InitOnce(2); - // out_label_queue = label_holder->GetQueue(); - // } - - // ReaderManager::Instance()->GetReader( - // 0, ctx, out_queue.get(), out_label_queue.get()); - // LoDTensorArray samples = reader_wrapper.reader->Next(); - // framework::LoDTensorArray out_array; - // out_array.resize(samples.size()); - // for (size_t i = 0; i < 
samples.size(); ++i) { - // copy_tensor(samples[i], &out_array[i]); - // } - // out_queue->Push(out_array); - } + } private: void copy_tensor(const framework::LoDTensor& lod_tensor, diff --git a/paddle/fluid/operators/data/map_op.cc b/paddle/fluid/operators/data/map_op.cc index 56097316b22a16..ce1778ac710bdd 100644 --- a/paddle/fluid/operators/data/map_op.cc +++ b/paddle/fluid/operators/data/map_op.cc @@ -20,7 +20,6 @@ using framework::Tensor; class MapOp : public framework::OperatorBase { public: - // using framework::OperatorWithKernel::OperatorWithKernel; MapOp(const std::string& type, const framework::VariableNameMap& inputs, const framework::VariableNameMap& outputs, diff --git a/paddle/fluid/operators/data/map_runner.cc b/paddle/fluid/operators/data/map_runner.cc index 84d85ed139c377..e4e211b5f45506 100644 --- a/paddle/fluid/operators/data/map_runner.cc +++ b/paddle/fluid/operators/data/map_runner.cc @@ -60,7 +60,6 @@ bool MapRunner::ShareInputsIntoScope(Scope* scope) { // If input queue closed, namely EOE(end of epoch) from // dataset reader to here, read failed auto queue = input_queues_[i]; - // if (queue->IsClosed()) return false; // read LoDTensorArray from queue bool success = true; @@ -98,13 +97,6 @@ bool MapRunner::ShareInputsIntoScope(Scope* scope) { dst_tensor_arr.reserve(tensor_arr.size()); for (size_t i = 0; i < tensor_arr.size(); i++) { dst_tensor_arr.emplace_back(tensor_arr[i]); - // auto tensor = tensor_arr[i]; - // auto dst_tensor = dst_tensor_arr[i]; - // // dst_tensor.Resize(tensor.dims()); - // // dst_tensor.mutable_data(tensor.place(), tensor.type()); - // // dst_tensor.ShareDataWith(tensor); - // copy_tensor(tensor, &dst_tensor); - // // dst_tensor.set_lod(tensor.lod()); } } } @@ -132,7 +124,6 @@ void MapRunner::StartMapThread(const Scope* scope) { if (shutdown_) break; // Step 1: get input LoDTensor and share into Scope - // LOG(ERROR) << "MapThread Loop " << program_id_ << " start"; bool success = ShareInputsIntoScope(&scope_); if 
(!success) { for(auto& queue : output_queues_) { @@ -142,7 +133,6 @@ void MapRunner::StartMapThread(const Scope* scope) { running_ = false; continue; } - // LOG(ERROR) << "MapThread Loop " << program_id_ << " ShareInputsIntoScope finish"; // Step 2: run ops by executor without fetch try { @@ -150,7 +140,6 @@ void MapRunner::StartMapThread(const Scope* scope) { } catch(...) { break; } - // LOG(ERROR) << "MapThread Loop " << program_id_ << " program run finish"; // Step 3: fetch output variable to LoDTensor vector // and push to output queue @@ -176,10 +165,8 @@ void MapRunner::StartMapThread(const Scope* scope) { output_queues_[i]->Push(t_arr); } } - // LOG(ERROR) << "MapThread Loop " << program_id_ << " push queue finish"; } scope->DeleteScope(&scope_); - // LOG(ERROR) << "MapThread Loop " << program_id_ << " delete scope and return"; }); } diff --git a/paddle/fluid/operators/data/pipeline.cc b/paddle/fluid/operators/data/pipeline.cc index 7854c725bd9dee..c28517b3bb4e7c 100644 --- a/paddle/fluid/operators/data/pipeline.cc +++ b/paddle/fluid/operators/data/pipeline.cc @@ -101,7 +101,7 @@ void Pipeline::ReadNext(std::vector &out_vars) { PADDLE_ENFORCE_EQ(success, true, platform::errors::PreconditionNotMet("Read from output queue %s failed", output_var_names_[i])); - // CheckOutputVarStatus(*(out_vars[i]), output_var_names_[i]); + CheckOutputVarStatus(*(out_vars[i]), output_var_names_[i]); copy_tensor(outputs.at(0), out_vars[i]->GetMutable()); for (auto &output: outputs) output.clear(); outputs.clear(); diff --git a/paddle/fluid/operators/data/random_crop_and_resize_op.cc b/paddle/fluid/operators/data/random_crop_and_resize_op.cc index 55afed383c9bd0..f77be6f27bba62 100644 --- a/paddle/fluid/operators/data/random_crop_and_resize_op.cc +++ b/paddle/fluid/operators/data/random_crop_and_resize_op.cc @@ -40,11 +40,6 @@ class RandomCropAndResizeOp : public framework::OperatorWithKernel { platform::errors::InvalidArgument( "w in Attr(size) of Op(RandomCropAndResize) " 
"should be greater than 0.")); - // auto x_dim = ctx->GetInputsDim("X"); // NCHW format - // - // std::vector out_dim = {static_cast(x_dim.size()), - // x_dim[0][0], size[0], size[1]}; - // ctx->SetOutputDim("Out", framework::make_ddim({out_dim})); } framework::OpKernelType GetExpectedKernelType( diff --git a/paddle/fluid/operators/data/random_crop_and_resize_op.cu b/paddle/fluid/operators/data/random_crop_and_resize_op.cu index ea92245e3ea190..a8e3e02c95ef96 100644 --- a/paddle/fluid/operators/data/random_crop_and_resize_op.cu +++ b/paddle/fluid/operators/data/random_crop_and_resize_op.cu @@ -41,8 +41,7 @@ __global__ void KeNearestNeighborInterpFw( int out_id_h = tid / output_w; // single image's index int out_id_w = tid % output_w; - // input_w or output_w = c * h * w - // img_size = h * w + // input_w or output_w = c * h * w, img_size = h * w int in_img_size = input_w / num_channels; int out_img_size = output_w / num_channels; @@ -96,8 +95,7 @@ __global__ void KeBilinearInterpFw( int out_id_h = tid / output_w; // single image's index int out_id_w = tid % output_w; - // input_w or output_w = c * h * w - // img_size = h * w + // input_w or output_w = c * h * w, img_size = h * w int in_img_size = input_w / num_channels; int out_img_size = output_w / num_channels; @@ -274,7 +272,6 @@ template class RandomCropAndResizeCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - LOG(ERROR) << "RandomCropAndResizeCUDAKernel Compute start"; PADDLE_ENFORCE_EQ( platform::is_gpu_place(ctx.GetPlace()), true, platform::errors::NotFound("This kernel only runs on GPU device.")); @@ -285,24 +282,6 @@ class RandomCropAndResizeCUDAKernel : public framework::OpKernel { "The size of X must be greater than 0.")); auto* out = ctx.Output("Out"); - // auto* in_var = ctx.InputVar("X"); - // auto in_queue = in_var->Get().GetQueue(); - // - // auto* out_var = ctx.OutputVar("Out"); - // auto out_queue = out_var->Get().GetQueue(); 
- // if (out_queue == nullptr) { - // LOG(ERROR) << "RandomCropAndResize out_queue init"; - // auto* holder = out_var->template GetMutable(); - // holder->InitOnce(2); - // out_queue = holder->GetQueue(); - // } - // - // bool success = false; - // auto x = in_queue->Pop(&success); - // PADDLE_ENFORCE_EQ(success, true, - // platform::errors::PreconditionNotMet("Read from input queue failed")); - // framework::LoDTensor out; - // get size, scale, ratio auto size = ctx.Attr>("size"); auto scale = ctx.Attr>("scale"); @@ -342,12 +321,6 @@ class RandomCropAndResizeCUDAKernel : public framework::OpKernel { align_corners, align_mode, img_h, img_w, img_c, idx_h, idx_w, crop_h, crop_w, data_layout); } - - // framework::LoDTensorArray out_array; - // out_array.reserve(1); - // out_array.emplace_back(out); - // out_queue->Push(out_array); - LOG(ERROR) << "RandomCropAndResizeCUDAKernel Compute finish"; } }; diff --git a/paddle/fluid/operators/data/random_flip_op.cc b/paddle/fluid/operators/data/random_flip_op.cc deleted file mode 100644 index eefba4b2b021b8..00000000000000 --- a/paddle/fluid/operators/data/random_flip_op.cc +++ /dev/null @@ -1,90 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include - -#include "paddle/fluid/operators/data/random_flip_op.h" -#include "paddle/fluid/framework/op_version_registry.h" - -namespace paddle { -namespace operators { -namespace data { - -using framework::OpKernelType; -using framework::Tensor; - -class RandomFlipOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("X"), true, - platform::errors::NotFound("Input(X) of RandomFlipOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::NotFound( - "Output(Out) of RandomFlipOp should not be null.")); - - auto x_dims = ctx->GetInputDim("X"); - ctx->SetOutputDim("Out", phi::make_ddim({x_dims[0], 1})); - ctx->ShareLoD("X", "Out"); - } - - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const { - auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); - return framework::OpKernelType(input_data_type, - platform::CPUPlace()); - } -}; - -class RandomFlipOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor), The input tensor of flip op."); - AddOutput("Out", "(Tensor), The output tensor in shape of [N, 1], N is " - "the batch size of X, bool data indicates whether to " - "perform flip in this sample."); - AddAttr("probability", "The probability to flip each sample.") - .SetDefault(0.5); - AddAttr("seed", "The seed for uniform random generator") - .SetDefault(0); - AddComment(R"DOC( - Random Flip Operator. 
- )DOC"); - } -}; - -class RandomFlipOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput { - protected: - std::unordered_map& GetInputOutputWithSameType() - const override { - static std::unordered_map m{{"X", /*->*/ "Out"}}; - return m; - } -}; - -} // namespace data -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators::data; -namespace plat = paddle::platform; -REGISTER_OPERATOR(random_flip, ops::RandomFlipOp, ops::RandomFlipOpMaker, ops::RandomFlipOpInferVarType); - -REGISTER_OP_CPU_KERNEL( - random_flip, ops::RandomFlipCPUKernel, - ops::RandomFlipCPUKernel, - ops::RandomFlipCPUKernel); diff --git a/paddle/fluid/operators/data/random_flip_op.h b/paddle/fluid/operators/data/random_flip_op.h deleted file mode 100644 index bbf38806de1e52..00000000000000 --- a/paddle/fluid/operators/data/random_flip_op.h +++ /dev/null @@ -1,84 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace operators { -namespace data { - -using Tensor = framework::Tensor; - -constexpr size_t dim_bitset_size = 64; - -class RandomFlipGenerator { - public: - RandomFlipGenerator(int seed, float prob) - : distribution_(prob), - seed_(seed) { - if (seed != 0) rng_.seed(seed); - else rng_.seed(time(0)); - } - - ~RandomFlipGenerator() = default; - - bool Generate() { return distribution_(rng_); } - - private: - std::bernoulli_distribution distribution_; - int seed_; - std::mt19937 rng_; -}; - -std::map> seed_to_generator_; - -static RandomFlipGenerator* CreateRandomFlipGenerator(int seed, float prob) { - auto iter = seed_to_generator_.find(seed); - if (iter == seed_to_generator_.end()) { - seed_to_generator_[seed] = std::unique_ptr( - new RandomFlipGenerator(seed, prob)); - } - - return seed_to_generator_[seed].get(); -} - -template -class RandomFlipCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* x = ctx.Input("X"); - Tensor* out = ctx.Output("Out"); - - auto prob = ctx.Attr("probability"); - auto seed = ctx.Attr("seed"); - - auto* data = out->mutable_data(ctx.GetPlace()); - auto* generator = CreateRandomFlipGenerator(seed, prob); - for (int64_t i = 0; i < x->dims()[0]; i++) { - data[i] = generator->Generate(); - } - } -}; - -} // namespace data -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/data/random_roi_generator.h b/paddle/fluid/operators/data/random_roi_generator.h index 80e2675817b0d3..57bfb93ed09e9c 100644 --- a/paddle/fluid/operators/data/random_roi_generator.h +++ b/paddle/fluid/operators/data/random_roi_generator.h @@ -62,7 +62,6 @@ class GeneratorManager { using Generators = std::vector>; private: - // DISABLE_COPY_AND_ASSIGN(GeneratorManager); 
static GeneratorManager* gm_instance_ptr_; static std::mutex m_; diff --git a/paddle/fluid/operators/data/unity_build_rule.cmake b/paddle/fluid/operators/data/unity_build_rule.cmake index e9bde0c1f0ccc7..99146bf156dd69 100644 --- a/paddle/fluid/operators/data/unity_build_rule.cmake +++ b/paddle/fluid/operators/data/unity_build_rule.cmake @@ -12,11 +12,16 @@ register_unity_group(cc dataloader_op.cc map_op.cc batch_decode_random_crop_op.cc - random_flip_op.cc - mirror_normalize_op.cc) + batch_decode_op.cc + batch_resize_op.cc + mirror_normalize_op.cc + random_crop_and_resize_op.cc) register_unity_group(cu dataloader_op.cu.cc map_op.cu.cc - batch_decode_random_crop_op.cu, - mirror_normalize_op.cu) + batch_decode_random_crop_op.cu + batch_decode_op.cu + batch_resize_op.cu + mirror_normalize_op.cu + random_crop_and_resize_op.cu) diff --git a/paddle/fluid/operators/data/utils.h b/paddle/fluid/operators/data/utils.h index f043574fa77d50..b0ff265820c262 100644 --- a/paddle/fluid/operators/data/utils.h +++ b/paddle/fluid/operators/data/utils.h @@ -29,15 +29,12 @@ void ShutDownAllDataLoaders() { VLOG(4) << "ShutDownAllDataLoaders enter"; // step 1: shutdown reader ReaderManager::Instance()->ShutDown(); - // LOG(ERROR) << "ShutDownAllDataLoaders reader_wrapper shutdown finish"; // step 2: shutdown decoder if (decode_pool) decode_pool->ShutDown(); - // LOG(ERROR) << "ShutDownAllDataLoaders decode_pool shutdown finish"; // step 3: shutdown MapRunner MapRunnerManager::Instance()->ShutDown(); - // LOG(ERROR) << "ShutDownAllDataLoaders MapRunner shutdown finish"; // step 3: shutdown Pipeline PipelineManager::Instance()->ShutDown(); @@ -45,32 +42,29 @@ void ShutDownAllDataLoaders() { } void ShutDownReadersAndDecoders(const int64_t program_id) { - LOG(ERROR) << "ShutDownReadersAndDecoders enter, program_id: " << program_id; // step 1: shutdown reader ReaderManager::Instance()->ShutDownReader(program_id); // step 2: shutdown decoder 
ImageDecoderThreadPoolManager::Instance()->ShutDownDecoder(program_id); - LOG(ERROR) << "ShutDownReadersAndDecoders finish"; } void ShutDownMaps(const std::vector program_ids) { - LOG(ERROR) << "ShutDownMaps enter, maps size: " << program_ids.size(); for (auto& program_id : program_ids) { MapRunnerManager::Instance()->ShutDownMapRunner(program_id); } - LOG(ERROR) << "ShutDownMaps finish"; } void ShutDownPipeline(const int64_t program_id) { - LOG(ERROR) << "ShutDownPipeline program_id " << program_id << " enter"; PipelineManager::Instance()->ShutDownPipeline(program_id); - LOG(ERROR) << "ShutDownPipeline program_id " << program_id << " finish"; } void ResetDataLoader(const int64_t reader_id, const std::vector map_ids, const int64_t pipeline_id) { + VLOG(4) << "ResetDataLoader enter, reader_id: " << reader_id \ + << ", map_ids size: " << map_ids.size() << ", pipeline_id: " \ + << pipeline_id; // step 1: reset readers ReaderManager::Instance()->ResetReader(reader_id); @@ -81,6 +75,7 @@ void ResetDataLoader(const int64_t reader_id, // step3: reset pipeline PipelineManager::Instance()->ResetPipeline(pipeline_id); + VLOG(4) << "ResetDataLoader finish"; } } // namespace data diff --git a/python/paddle/fluid/dataloader/pipeline.py b/python/paddle/fluid/dataloader/pipeline.py index 14346a36bccc08..67b138e3c4253c 100755 --- a/python/paddle/fluid/dataloader/pipeline.py +++ b/python/paddle/fluid/dataloader/pipeline.py @@ -133,13 +133,7 @@ def __next__(self): self._output_vars = self._prepare_output_vars() try: - import sys - import time - tic = time.time() _C_ops.dataloader(self._output_vars, *self._attrs) - toc = time.time() - print("_C_ops calling cost {}ms".format((toc - tic) * 1000.)) - sys.stdout.flush() except: raise StopIteration From a27a9ceac54e52cf3cec5c7c59b25093723ab421 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Mon, 28 Mar 2022 09:16:03 +0000 Subject: [PATCH 75/95] rename data_io_queue -> dataloader_pass. 
test=develop --- paddle/fluid/framework/details/CMakeLists.txt | 2 +- paddle/fluid/framework/details/build_strategy.cc | 4 ++-- paddle/fluid/framework/ir/CMakeLists.txt | 2 +- .../ir/{data_io_queue_pass.cc => dataloader_queue_pass.cc} | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) rename paddle/fluid/framework/ir/{data_io_queue_pass.cc => dataloader_queue_pass.cc} (96%) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 1cc4cba7def1df..f4a49a0c1f7cde 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -139,7 +139,7 @@ set(IR_PASS_DEPS graph_viz_pass multi_devices_graph_pass coalesce_grad_tensor_pass fuse_all_reduce_op_pass backward_optimizer_op_deps_pass fuse_adam_op_pass fuse_sgd_op_pass fuse_momentum_op_pass sync_batch_norm_pass runtime_context_cache_pass graph_to_program_pass - fix_op_run_order_pass fuse_gemm_epilogue_pass data_io_queue_pass) + fix_op_run_order_pass fuse_gemm_epilogue_pass dataloader_queue_pass) if (WITH_CINN) set(IR_PASS_DEPS ${IR_PASS_DEPS} build_cinn_pass) diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 71769f56760a8d..bcc4ed9f7b9272 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -84,7 +84,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { // Note: This pass is used to check whether the multi_device_graph is right. 
AppendPass("multi_devices_check_pass"); - AppendPass("data_io_queue_pass"); + AppendPass("dataloader_queue_pass"); SetCollectiveContext(); } @@ -505,7 +505,7 @@ USE_PASS(fuse_momentum_op_pass); USE_PASS(fuse_all_reduce_op_pass); USE_PASS(runtime_context_cache_pass); USE_PASS(add_reader_dependency_pass); -USE_PASS(data_io_queue_pass); +USE_PASS(dataloader_queue_pass); #ifdef PADDLE_WITH_CINN USE_PASS(build_cinn_pass); #endif diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 245a6eb5448a17..70127144500187 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -99,7 +99,7 @@ pass_library(matmul_scale_fuse_pass inference) pass_library(gpu_cpu_map_matmul_to_mul_pass inference) pass_library(mixed_precision_configure_pass inference) pass_library(generate_pass DEPS pass_desc_proto) -pass_library(data_io_queue_pass base) +pass_library(dataloader_queue_pass base) target_link_libraries(generate_pass pass_desc_proto) if(WITH_TENSORRT) diff --git a/paddle/fluid/framework/ir/data_io_queue_pass.cc b/paddle/fluid/framework/ir/dataloader_queue_pass.cc similarity index 96% rename from paddle/fluid/framework/ir/data_io_queue_pass.cc rename to paddle/fluid/framework/ir/dataloader_queue_pass.cc index d283e1edef1539..dc9e7ac024cfa2 100644 --- a/paddle/fluid/framework/ir/data_io_queue_pass.cc +++ b/paddle/fluid/framework/ir/dataloader_queue_pass.cc @@ -94,7 +94,7 @@ static void ProcessInputArrayOp(ir::Graph* graph) { } } -class DataIOQueuePass: public Pass { +class DataLoaderQueuePass: public Pass { protected: void ApplyImpl(ir::Graph* graph) const override { ProcessOutputQueueHolderOp(graph); @@ -106,4 +106,4 @@ class DataIOQueuePass: public Pass { } // namespace framework } // namespace paddle -REGISTER_PASS(data_io_queue_pass, paddle::framework::ir::DataIOQueuePass); +REGISTER_PASS(dataloader_queue_pass, paddle::framework::ir::DataLoaderQueuePass); From 
71900f41c4cd2334aa3f137161ade5f682b3968e Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 29 Mar 2022 07:49:28 +0000 Subject: [PATCH 76/95] refine API and add file_label_loader unittest. test=develop --- python/paddle/fluid/dataloader/pipeline.py | 5 +- python/paddle/fluid/reader.py | 18 +- .../unittests/test_file_label_loader_op.py | 155 ++++++++++++++++++ python/paddle/io/__init__.py | 2 - python/paddle/vision/reader.py | 32 +--- 5 files changed, 168 insertions(+), 44 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_file_label_loader_op.py diff --git a/python/paddle/fluid/dataloader/pipeline.py b/python/paddle/fluid/dataloader/pipeline.py index 67b138e3c4253c..75ac0faa6dc882 100755 --- a/python/paddle/fluid/dataloader/pipeline.py +++ b/python/paddle/fluid/dataloader/pipeline.py @@ -24,7 +24,7 @@ from collections.abc import Sequence, Mapping -__all__ = ["Pipeline"] +__all__ = ["DataPipeline"] CleanupFuncRegistrar.register(core._shutdown_all_dataloaders) @@ -33,7 +33,7 @@ AVAILABLE_OP_TYPES = ['data_reader', 'map'] -class Pipeline: +class DataPipeline(object): """ Data pipeline @@ -52,7 +52,6 @@ def __init__(self, queue_depth=2): if paddle.distributed.ParallelEnv().nranks > 1: paddle.set_device('gpu:%d' % paddle.distributed.ParallelEnv().dev_id) - # paddle.distributed.init_parallel_env() def _init_programs(self): self._main_program = fluid.Program() diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py index d9a7c7dbafa366..0e6c1f57f411d1 100644 --- a/python/paddle/fluid/reader.py +++ b/python/paddle/fluid/reader.py @@ -22,7 +22,7 @@ from .executor import global_scope from .data_feeder import DataFeeder, BatchedTensorProvider from .multiprocess_utils import multiprocess_queue_set, CleanupFuncRegistrar, _cleanup_mmap, _cleanup, _set_SIGCHLD_handler -from .dataloader import BatchSampler, Dataset, IterableDataset, Pipeline +from .dataloader import BatchSampler, Dataset, IterableDataset, DataPipeline from 
.dataloader.dataloader_iter import _DataLoaderIterSingleProcess, _DataLoaderIterMultiProcess, _DatasetKind, default_collate_fn from .dataloader.batch_sampler import _InfiniteIterableSampler from .layers.io import monkey_patch_reader_methods, _copy_reader_var_, double_buffer @@ -327,6 +327,14 @@ def __init__(self, timeout=0, worker_init_fn=None, persistent_workers=False): + + if callable(dataset): + with DataPipeline() as pipeline: + outputs = func(*args, **kwargs) + pipeline.set_outputs(outputs) + pipeline.build() + return pipeline + self.return_list = return_list self.collate_fn = collate_fn self.use_buffer_reader = use_buffer_reader @@ -434,14 +442,6 @@ def __iter__(self): def __call__(self): return self.__iter__() - @staticmethod - def from_pipeline(func, *args, **kwargs): - with Pipeline() as pipeline: - outputs = func(*args, **kwargs) - pipeline.set_outputs(outputs) - pipeline.build() - return pipeline - @staticmethod def from_generator(feed_list=None, capacity=None, diff --git a/python/paddle/fluid/tests/unittests/test_file_label_loader_op.py b/python/paddle/fluid/tests/unittests/test_file_label_loader_op.py new file mode 100644 index 00000000000000..4f6a3a9633796b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_file_label_loader_op.py @@ -0,0 +1,155 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import os +import unittest +import numpy as np + +import paddle +import paddle.fluid.core as core +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard +from paddle.utils.download import get_path_from_url +from paddle.vision.datasets import DatasetFolder +from paddle.vision.reader import _sampler_manager, file_label_loader + + +DATASET_HOME = os.path.expanduser("~/.cache/paddle/datasets") +DATASET_URL = "https://paddlemodels.cdn.bcebos.com/ImageNet_stub.tar" +DATASET_MD5 = "c7110519124a433901cf005a4a91b607" + +class TestFileLabelLoaderStatic(unittest.TestCase): + def setup(self): + self.data_root = get_path_from_url(DATASET_URL, DATASET_HOME, + DATASET_MD5) + self.batch_size = 16 + self.shuffle = False + self.drop_last = False + self.dynamic = False + + if not self.dynamic: + self.build_program() + + def build_program(self): + paddle.enable_static() + self.indices_data = paddle.static.data( + shape=[self.batch_size], dtype='int64', name='indices') + self.sample_data, self.label_data = file_label_loader(self.data_root, self.indices_data) + self.exe = paddle.static.Executor(paddle.CPUPlace()) + paddle.disable_static() + + def loader_function(self, indices): + if paddle.in_dynamic_mode(): + indices = paddle.to_tensor(indices) + return file_label_loader(self.data_root, indices) + else: + paddle.enable_static() + return self.exe.run(paddle.static.default_main_program(), + feed={'indices': indices}, + fetch_list=[self.sample_data, + self.label_data]) + + def test_check_output(self): + self.setup() + + data_folder = DatasetFolder(self.data_root) + samples = [s[0] for s in data_folder.samples] + targets = [s[1] for s in data_folder.samples] + + sampler_id = fluid.layers.utils._hash_with_id( + self.data_root, self.batch_size, + self.shuffle, self.drop_last, + self.dynamic) + sampler = _sampler_manager.get(sampler_id, + batch_size=self.batch_size, + num_samples=len(samples), + shuffle=self.shuffle, + 
drop_last=self.drop_last) + + num_iters = (len(samples) + self.batch_size - 1) // self.batch_size + for _ in range(num_iters): + indices = next(sampler) + sample, target = self.loader_function(indices) + assert np.array_equal(target, np.array(targets)[indices]) + + +class TestFileLabelLoaderDynamic(TestFileLabelLoaderStatic): + def setup(self): + self.data_root = get_path_from_url(DATASET_URL, DATASET_HOME, + DATASET_MD5) + self.batch_size = 16 + self.shuffle = False + self.drop_last = False + self.dynamic = True + + if not self.dynamic: + self.build_program() + + +class TestFileLabelLoaderStaticShuffle(TestFileLabelLoaderStatic): + def setup(self): + self.data_root = get_path_from_url(DATASET_URL, DATASET_HOME, + DATASET_MD5) + self.batch_size = 16 + self.shuffle = True + self.drop_last = False + self.dynamic = False + + if not self.dynamic: + self.build_program() + + +class TestFileLabelLoaderDynamicShuffle(TestFileLabelLoaderStatic): + def setup(self): + self.data_root = get_path_from_url(DATASET_URL, DATASET_HOME, + DATASET_MD5) + self.batch_size = 16 + self.shuffle = True + self.drop_last = False + self.dynamic = True + + if not self.dynamic: + self.build_program() + + +class TestFileLabelLoaderStaticDropLast(TestFileLabelLoaderStatic): + def setup(self): + self.data_root = get_path_from_url(DATASET_URL, DATASET_HOME, + DATASET_MD5) + self.batch_size = 16 + self.shuffle = True + self.drop_last = True + self.dynamic = False + + if not self.dynamic: + self.build_program() + + +class TestFileLabelLoaderDynamicDropLast(TestFileLabelLoaderStatic): + def setup(self): + self.data_root = get_path_from_url(DATASET_URL, DATASET_HOME, + DATASET_MD5) + self.batch_size = 16 + self.shuffle = True + self.drop_last = True + self.dynamic = True + + if not self.dynamic: + self.build_program() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/io/__init__.py b/python/paddle/io/__init__.py index 8d362f69ee61b5..b487a3a4dfce37 100755 --- 
a/python/paddle/io/__init__.py +++ b/python/paddle/io/__init__.py @@ -15,7 +15,6 @@ # TODO: define all functions about input & output in this directory from ..fluid.io import DataLoader # noqa: F401 -from ..fluid.dataloader import Pipeline # noqa: F401 from ..fluid.dataloader import Dataset # noqa: F401 from ..fluid.dataloader import IterableDataset # noqa: F401 from ..fluid.dataloader import BatchSampler # noqa: F401 @@ -43,7 +42,6 @@ 'DistributedBatchSampler', 'DataLoader', 'get_worker_info', - 'Pipeline', 'Sampler', 'SequenceSampler', 'RandomSampler', diff --git a/python/paddle/vision/reader.py b/python/paddle/vision/reader.py index cd4e19d3761623..061ccb9049a21d 100644 --- a/python/paddle/vision/reader.py +++ b/python/paddle/vision/reader.py @@ -32,8 +32,9 @@ class _Sampler(object): def __init__(self, batch_size, num_samples, shuffle=False, drop_last=False): self.batch_size = batch_size - self.drop_last = drop_last self.num_samples = num_samples + self.shuffle = shuffle + self.drop_last = drop_last self.start_idx = 0 self.sample_ids = np.arange(num_samples) @@ -181,33 +182,4 @@ def _reader(indices): shuffle=shuffle, drop_last=drop_last, seed=seed) - # inputs = dict() - # attrs = { - # 'root_dir': file_root, - # 'batch_size': batch_size, - # 'files': samples, - # 'labels': targets, - # 'reader_id': unq_reader_id, - # } - # - # helper = LayerHelper("file_label_reader", **locals()) - # out = helper.create_variable( - # name=unique_name.generate("file_label_reader"), - # type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, - # dtype='uint8') - # - # label = helper.create_variable( - # name=unique_name.generate("file_label_reader"), - # type=core.VarDesc.VarType.LOD_TENSOR, - # dtype='int') - # - # helper.append_op( - # type="file_label_reader", - # inputs=inputs, - # attrs=attrs, - # outputs={"Out": out, - # "Label": label - # }) - - return out, label From d608b849954847e2b90b51eb3e3d0b05b3b50278 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 29 Mar 2022 11:24:31 
+0000 Subject: [PATCH 77/95] add mirror normalize unittests. test=develop --- .../unittests/test_mirror_normalize_op.py | 95 +++++++++++++++++++ python/paddle/vision/ops.py | 16 +--- 2 files changed, 99 insertions(+), 12 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_mirror_normalize_op.py diff --git a/python/paddle/fluid/tests/unittests/test_mirror_normalize_op.py b/python/paddle/fluid/tests/unittests/test_mirror_normalize_op.py new file mode 100644 index 00000000000000..f9ee61e52e796e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_mirror_normalize_op.py @@ -0,0 +1,95 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import os +import copy +import unittest +import numpy as np + +import paddle +import paddle.fluid.core as core +import paddle.fluid as fluid +from paddle.vision.ops import mirror_normalize + + +def np_mirror_normalize(image, mirror, mean, std): + image = copy.deepcopy(image) + for i, m in enumerate(mirror): + if m[0]: + image[i] = image[i][:, :, -1::-1] + + mean = np.array(mean[:]).reshape([1, 3, 1, 1]) + std = np.array(std[:]).reshape([1, 3, 1, 1]) + + return (image - mean) / std + + +class TestMirrorNormalizeStatic(unittest.TestCase): + def setup(self): + self.image_shape = [2, 3, 2, 2] + self.mirror_shape = [2, 1] + self.mean = [123.675, 116.28, 103.53] + self.std = [58.395, 57.120, 57.375] + + self.image = np.random.randint(0, 256, self.image_shape, 'int32').astype("float32") + self.mirror = np.random.randint(0, 2, self.mirror_shape, 'int32').astype("bool") + + self.result = np_mirror_normalize(self.image, self.mirror, + self.mean, self.std) + + def test_check_output_dynamic(self): + # NOTE: only supoort CUDA kernel currently + if not core.is_compiled_with_cuda(): + return + + self.setup() + dy_result = mirror_normalize(paddle.to_tensor(self.image), + paddle.to_tensor(self.mirror), + self.mean, self.std) + assert np.allclose(self.result, dy_result.numpy()) + + def test_check_output_static(self): + self.setup() + paddle.enable_static() + + image_data = paddle.static.data(shape=self.image_shape, + dtype='float32', + name="image") + mirror_data = paddle.static.data(shape=self.mirror_shape, + dtype='bool', + name="mirror") + result_data = mirror_normalize(image_data, mirror_data, + self.mean, self.std) + + # NOTE: only supoort CUDA kernel currently + places = [] + if core.is_compiled_with_cuda(): + places.append(paddle.CUDAPlace(0)) + + for place in places: + exe = paddle.static.Executor(place) + st_result = exe.run(paddle.static.default_main_program(), + feed={"image": self.image, + "mirror": self.mirror}, + 
fetch_list=[result_data]) + + assert np.allclose(self.result, st_result) + + paddle.disable_static() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index 8b54de32c670c5..9cb2641ae8774c 100644 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -1020,7 +1020,6 @@ def image_decode_random_crop(x, name=unique_name.generate("image_decode_random_crop"), type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, dtype=x.dtype) - # out = helper.create_variable_for_type_inference('uint8') helper.append_op( type="batch_decode_random_crop", inputs=inputs, attrs=attrs, outputs={"Out": out}) @@ -1034,17 +1033,6 @@ def random_flip(x, prob=0.5, name=None): rand_vec = layers.uniform_random_batch_size_like( x, [1, 1], min=0., max=1.) return rand_vec < prob - # helper = LayerHelper("random_flip", **locals()) - # out = helper.create_variable( - # name=unique_name.generate("random_flip"), - # type=core.VarDesc.VarType.LOD_TENSOR, - # dtype=core.VarDesc.VarType.BOOL) - # helper.append_op( - # type="random_flip", - # inputs={"X": x}, - # outputs={"Out": out}, - # attrs={"probability": prob}) - # return out def mirror_normalize(x, mirror, @@ -1065,6 +1053,10 @@ def _to_list_3(l): mean = _to_list_3(mean) std = _to_list_3(std) + if _non_static_mode(): + return _C_ops.mirror_normalize(x, mirror, "mean", mean, + "std", std) + helper = LayerHelper("mirror_normalize", **locals()) dtype = helper.input_dtype() out = helper.create_variable_for_type_inference(dtype) From 6281d289504fdeaa931aedf95226eee56477d5c3 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 29 Mar 2022 13:22:41 +0000 Subject: [PATCH 78/95] add unittest for random_flip. 
test=develop --- .../paddle/tests/test_data_api_random_flip.py | 64 +++++++++++++++++++ .../test_ops_file_label_loader.py} | 0 .../test_ops_mirror_normalize.py} | 41 +++++++++++- 3 files changed, 102 insertions(+), 3 deletions(-) create mode 100644 python/paddle/tests/test_data_api_random_flip.py rename python/paddle/{fluid/tests/unittests/test_file_label_loader_op.py => tests/test_ops_file_label_loader.py} (100%) rename python/paddle/{fluid/tests/unittests/test_mirror_normalize_op.py => tests/test_ops_mirror_normalize.py} (70%) diff --git a/python/paddle/tests/test_data_api_random_flip.py b/python/paddle/tests/test_data_api_random_flip.py new file mode 100644 index 00000000000000..632f5e65c4f649 --- /dev/null +++ b/python/paddle/tests/test_data_api_random_flip.py @@ -0,0 +1,64 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import cv2 +import shutil +import unittest +import numpy as np + +import paddle +import paddle.fluid.core as core +from paddle.vision.ops import random_flip + + +class TestRandomFlip(unittest.TestCase): + def test_errors(self): + try: + data = paddle.ones([16, 3, 32, 32], dtype="float32") + out = random_flip(data, 1.5) + + # should not execute following lines + assert False + except ValueError: + pass + + def test_dynamic(self): + data = paddle.ones([16, 3, 32, 32], dtype="float32") + out = random_flip(data, 0.5) + + assert out.dtype == paddle.bool + assert out.shape == [16, 1] + + def test_static(self): + paddle.enable_static() + input_data = paddle.static.data(shape=[16, 3, 32, 32], dtype="float32", name="input") + out_data = random_flip(input_data, 0.5) + + places = [paddle.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(paddle.CUDAPlace(0)) + + for place in places: + exe = paddle.static.Executor(place) + out, = exe.run(paddle.static.default_main_program(), + feed={"input": np.ones([16, 3, 32, 32], dtype="float32")}, + fetch_list=[out_data]) + assert out.dtype == np.bool + assert out.shape == (16, 1) + paddle.disable_static() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_file_label_loader_op.py b/python/paddle/tests/test_ops_file_label_loader.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_file_label_loader_op.py rename to python/paddle/tests/test_ops_file_label_loader.py diff --git a/python/paddle/fluid/tests/unittests/test_mirror_normalize_op.py b/python/paddle/tests/test_ops_mirror_normalize.py similarity index 70% rename from python/paddle/fluid/tests/unittests/test_mirror_normalize_op.py rename to python/paddle/tests/test_ops_mirror_normalize.py index f9ee61e52e796e..ec172355cb8256 100644 --- a/python/paddle/fluid/tests/unittests/test_mirror_normalize_op.py +++ b/python/paddle/tests/test_ops_mirror_normalize.py @@ -31,16 +31,23 @@ def 
np_mirror_normalize(image, mirror, mean, std): if m[0]: image[i] = image[i][:, :, -1::-1] + mean = np.array(mean) + std = np.array(std) + if np.size(mean) == 1: + mean = np.tile(mean, (3,)) + if np.size(std) == 1: + std = np.tile(std, (3,)) + mean = np.array(mean[:]).reshape([1, 3, 1, 1]) std = np.array(std[:]).reshape([1, 3, 1, 1]) return (image - mean) / std -class TestMirrorNormalizeStatic(unittest.TestCase): +class TestMirrorNormalize(unittest.TestCase): def setup(self): - self.image_shape = [2, 3, 2, 2] - self.mirror_shape = [2, 1] + self.image_shape = [16, 3, 32, 32] + self.mirror_shape = [16, 1] self.mean = [123.675, 116.28, 103.53] self.std = [58.395, 57.120, 57.375] @@ -91,5 +98,33 @@ def test_check_output_static(self): paddle.disable_static() +class TestMirrorNormalizeSingleMeanStd(TestMirrorNormalize): + def setup(self): + self.image_shape = [16, 3, 32, 32] + self.mirror_shape = [16, 1] + self.mean = [123.675] + self.std = [58.395] + + self.image = np.random.randint(0, 256, self.image_shape, 'int32').astype("float32") + self.mirror = np.random.randint(0, 2, self.mirror_shape, 'int32').astype("bool") + + self.result = np_mirror_normalize(self.image, self.mirror, + self.mean, self.std) + + +class TestMirrorNormalizeFloatMeanStd(TestMirrorNormalize): + def setup(self): + self.image_shape = [16, 3, 32, 32] + self.mirror_shape = [16, 1] + self.mean = 123.675 + self.std = 58.395 + + self.image = np.random.randint(0, 256, self.image_shape, 'int32').astype("float32") + self.mirror = np.random.randint(0, 2, self.mirror_shape, 'int32').astype("bool") + + self.result = np_mirror_normalize(self.image, self.mirror, + self.mean, self.std) + + if __name__ == '__main__': unittest.main() From cc51bbfa0b22344342e461319bce9a1e1f328291 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Wed, 30 Mar 2022 13:59:29 +0000 Subject: [PATCH 79/95] add test_ops_crop_resize. 
test=develop --- .../fluid/operators/data/batch_resize_op.cc | 2 +- .../fluid/operators/data/batch_resize_op.cu | 56 +-- python/paddle/fluid/reader.py | 25 +- .../paddle/tests/test_data_api_random_flip.py | 6 +- python/paddle/tests/test_ops_crop_resize.py | 379 ++++++++++++++++++ python/paddle/vision/ops.py | 16 +- 6 files changed, 437 insertions(+), 47 deletions(-) create mode 100644 python/paddle/tests/test_ops_crop_resize.py diff --git a/paddle/fluid/operators/data/batch_resize_op.cc b/paddle/fluid/operators/data/batch_resize_op.cc index 44a3ea866238f9..d3fbbfd17f58ad 100644 --- a/paddle/fluid/operators/data/batch_resize_op.cc +++ b/paddle/fluid/operators/data/batch_resize_op.cc @@ -85,7 +85,7 @@ class BatchResizeOpMaker : public framework::OpProtoAndCheckerMaker { "can be \'1\' for src_idx = scale*dst_index .") .SetDefault(1); AddAttr( - "data_layout", + "data_format", "(string, default NCHW) Only used in " "an optional string from: \"NHWC\", \"NCHW\". " "Specify that the data format of the input and output data is " diff --git a/paddle/fluid/operators/data/batch_resize_op.cu b/paddle/fluid/operators/data/batch_resize_op.cu index 2344601840b886..4953e39801d3de 100644 --- a/paddle/fluid/operators/data/batch_resize_op.cu +++ b/paddle/fluid/operators/data/batch_resize_op.cu @@ -29,7 +29,7 @@ __global__ void KeNearestNeighborInterpFw( const size_t input_h, const size_t input_w, T* out, const size_t out_img_h, const size_t out_img_w, const size_t output_h, const size_t output_w, const size_t num_channels, const float ratio_h, const float ratio_w, - const bool align_corners, const DataLayout data_layout) { + const bool align_corners, const DataLayout data_format) { int nthreads = output_h * output_w; int tid = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; @@ -44,7 +44,7 @@ __global__ void KeNearestNeighborInterpFw( // get output c, h, w index int channel_id, out_img_idy, out_img_idx; - if (data_layout == DataLayout::kNCHW) { + if 
(data_format == DataLayout::kNCHW) { channel_id = out_id_w / out_img_size; out_img_idy = (out_id_w % out_img_size) / out_img_w; out_img_idx = tid % out_img_w; @@ -63,7 +63,7 @@ __global__ void KeNearestNeighborInterpFw( ? static_cast(ratio_w * out_img_idx + 0.5) : static_cast(ratio_w * out_img_idx); - if (data_layout == DataLayout::kNCHW) { + if (data_format == DataLayout::kNCHW) { out[tid] = in[out_id_h * input_w + channel_id * in_img_size + in_img_idy * in_img_w + in_img_idx]; } else { @@ -80,7 +80,7 @@ __global__ void KeBilinearInterpFw( const size_t out_img_h, const size_t out_img_w, const size_t output_h, const size_t output_w, const size_t num_channels, const float ratio_h, const float ratio_w, const bool align_corners, const int align_mode, - const DataLayout data_layout) { + const DataLayout data_format) { int nthreads = output_h * output_w; int tid = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; @@ -96,7 +96,7 @@ __global__ void KeBilinearInterpFw( // get output c, h, w index int channel_id, out_img_idy, out_img_idx; - if (data_layout == DataLayout::kNCHW) { + if (data_format == DataLayout::kNCHW) { channel_id = out_id_w / out_img_size; out_img_idy = (out_id_w % out_img_size) / out_img_w; out_img_idx = tid % out_img_w; @@ -112,11 +112,11 @@ __global__ void KeBilinearInterpFw( : static_cast(ratio_h * out_img_idy); in_img_idy = in_img_idy > 0 ? in_img_idy : 0; int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0; - T src_h = ratio_h * (out_img_idy + 0.5) - 0.5; + float src_h = ratio_h * (out_img_idy + 0.5) - 0.5; src_h = src_h > 0 ? src_h : 0; - T h1lambda = align_flag ? src_h - in_img_idy + float h1lambda = align_flag ? src_h - in_img_idy : ratio_h * out_img_idy - in_img_idy; - T h2lambda = 1.f - h1lambda; + float h2lambda = 1.f - h1lambda; // get input w index with offset int in_img_idx = align_flag @@ -124,33 +124,33 @@ __global__ void KeBilinearInterpFw( : static_cast(ratio_w * out_img_idx); in_img_idx = in_img_idx > 0 ? 
in_img_idx : 0; int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0; - T src_w = ratio_w * (out_img_idx + 0.5) - 0.5; + float src_w = ratio_w * (out_img_idx + 0.5) - 0.5; src_w = src_w > 0 ? src_w : 0; - T w1lambda = align_flag ? src_w - in_img_idx + float w1lambda = align_flag ? src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx; - T w2lambda = 1.f - w1lambda; + float w2lambda = 1.f - w1lambda; - if (data_layout == DataLayout::kNCHW) { + if (data_format == DataLayout::kNCHW) { const T* in_pos = &in[out_id_h * input_w + channel_id * in_img_size + in_img_idy * in_img_w + in_img_idx]; // bilinear interpolation - out[out_id_h * output_w + out_id_w] = + out[out_id_h * output_w + out_id_w] = (T)( h2lambda * (w2lambda * in_pos[0] + w1lambda * in_pos[w_id]) + h1lambda * (w2lambda * in_pos[h_id * in_img_w] + - w1lambda * in_pos[h_id * in_img_w + w_id]); + w1lambda * in_pos[h_id * in_img_w + w_id])); } else { const T* in_pos = &in[out_id_h * input_w + in_img_idy * in_img_w * num_channels + in_img_idx * num_channels + channel_id]; // bilinear interpolation - out[out_id_h * output_w + out_id_w] = + out[out_id_h * output_w + out_id_w] = (T)( h2lambda * (w2lambda * in_pos[0] + w1lambda * in_pos[w_id * num_channels]) + h1lambda * (w2lambda * in_pos[h_id * in_img_w * num_channels] + w1lambda * in_pos[h_id * in_img_w * num_channels + - w_id * num_channels]); + w_id * num_channels])); } } } @@ -161,13 +161,13 @@ static void ResizeFwd( framework::Tensor* output, const std::vector out_size, const std::string interp_method, const bool align_corners, const int align_mode, const int img_h, const int img_w, const int c, - const DataLayout data_layout) { + const DataLayout data_format) { auto input_data = input.template data(); int out_h = static_cast(out_size[0]); int out_w = static_cast(out_size[1]); framework::DDim dim_out; - if (data_layout == DataLayout::kNCHW) { + if (data_format == DataLayout::kNCHW) { dim_out = {c, out_h, out_w}; } else { dim_out = {out_h, out_w, c}; @@ -196,13 
+196,13 @@ static void ResizeFwd( T><<>>( input_data, img_h, img_w, 1, in_chw, output_data, out_h, out_w, 1, - out_chw, c, ratio_h, ratio_w, align_corners, data_layout); + out_chw, c, ratio_h, ratio_w, align_corners, data_format); } else if ("bilinear" == interp_method) { KeBilinearInterpFw<<>>( input_data, img_h, img_w, 1, in_chw, output_data, out_h, out_w, 1, out_chw, c, ratio_h, ratio_w, align_corners, align_mode, - data_layout); + data_format); } } @@ -223,21 +223,21 @@ class BatchResizeCUDAKernel : public framework::OpKernel { // get size, scale, ratio auto size = ctx.Attr>("size"); - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); + const std::string data_format_str = ctx.Attr("data_format"); + const DataLayout data_format = + framework::StringToDataLayout(data_format_str); // get interpolation method const std::string interp_method = ctx.Attr("interp_method"); bool align_corners = ctx.Attr("align_corners"); int align_mode = ctx.Attr("align_mode"); auto* img = &x->at(0); - int64_t img_c = data_layout == DataLayout::kNCHW ? \ + int64_t img_c = data_format == DataLayout::kNCHW ? \ img->dims()[0] : img->dims()[2]; std::vector out_dim = {static_cast(x->size()), size[0], size[1], img_c}; - if (data_layout == DataLayout::kNCHW) { + if (data_format == DataLayout::kNCHW) { out_dim = {static_cast(x->size()), img_c, size[0], size[1]}; } @@ -248,13 +248,13 @@ class BatchResizeCUDAKernel : public framework::OpKernel { for (int i = 0; i < x->size(); i++) { img = &x->at(i); img_h = - data_layout == DataLayout::kNCHW ? img->dims()[1] : img->dims()[0]; + data_format == DataLayout::kNCHW ? img->dims()[1] : img->dims()[0]; img_w = - data_layout == DataLayout::kNCHW ? img->dims()[2] : img->dims()[1]; + data_format == DataLayout::kNCHW ? 
img->dims()[2] : img->dims()[1]; auto out_tensor = out->Slice(i, i + 1); ResizeFwd(ctx, *img, &out_tensor, size, interp_method, align_corners, align_mode, img_h, img_w, img_c, - data_layout); + data_format); } } }; diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py index 0e6c1f57f411d1..6e7a2cc0da1337 100644 --- a/python/paddle/fluid/reader.py +++ b/python/paddle/fluid/reader.py @@ -184,7 +184,7 @@ class DataLoader(object): Args: - dataset(Dataset): the dataset to load data from, should be an + dataset(Dataset|callable): the dataset to load data from, should be an instance of subclass of :code:`paddle.io.Dataset` or :code:`paddle.io.IterableDataset`. feed_list (list(Tensor)|tuple(Tensor)): feed Tensor list. @@ -329,11 +329,12 @@ def __init__(self, persistent_workers=False): if callable(dataset): - with DataPipeline() as pipeline: - outputs = func(*args, **kwargs) - pipeline.set_outputs(outputs) - pipeline.build() - return pipeline + self._use_data_pipeline = True + with DataPipeline() as self._data_pipeline: + outputs = dataset() + self._data_pipeline.set_outputs(outputs) + self._data_pipeline.build() + return self.return_list = return_list self.collate_fn = collate_fn @@ -428,6 +429,11 @@ def __len__(self): return len(self.dataset) def __iter__(self): + # use DataPipeline + if self._use_data_pipeline: + return self._data_pipeline + + # use multi-process DataLoader if self.num_workers == 0: return _DataLoaderIterSingleProcess(self) elif self._persistent_workers: @@ -442,6 +448,13 @@ def __iter__(self): def __call__(self): return self.__iter__() + def reset(self): + assert self._use_data_pipeline, \ + "reset() can only be used in DataPipeline mode, "\ + "which takes callabe function as dataset input "\ + "instead of paddle.io.Dataset" + self._data_pipeline.reset() + @staticmethod def from_generator(feed_list=None, capacity=None, diff --git a/python/paddle/tests/test_data_api_random_flip.py b/python/paddle/tests/test_data_api_random_flip.py 
index 632f5e65c4f649..0a24bbe0625adf 100644 --- a/python/paddle/tests/test_data_api_random_flip.py +++ b/python/paddle/tests/test_data_api_random_flip.py @@ -13,8 +13,6 @@ # limitations under the License. import os -import cv2 -import shutil import unittest import numpy as np @@ -34,14 +32,14 @@ def test_errors(self): except ValueError: pass - def test_dynamic(self): + def test_output_dynamic(self): data = paddle.ones([16, 3, 32, 32], dtype="float32") out = random_flip(data, 0.5) assert out.dtype == paddle.bool assert out.shape == [16, 1] - def test_static(self): + def test_output_static(self): paddle.enable_static() input_data = paddle.static.data(shape=[16, 3, 32, 32], dtype="float32", name="input") out_data = random_flip(input_data, 0.5) diff --git a/python/paddle/tests/test_ops_crop_resize.py b/python/paddle/tests/test_ops_crop_resize.py new file mode 100644 index 00000000000000..44efa0f5aac8d8 --- /dev/null +++ b/python/paddle/tests/test_ops_crop_resize.py @@ -0,0 +1,379 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import cv2 +import unittest +import numpy as np + +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.vision.ops import image_resize + + +def np_nearest_interp(image, + size, + align_corners=True, + data_format='NCHW'): + """nearest neighbor interpolation implement in shape [N, C, H, W]""" + if isinstance(size, int): + size = (size, size) + + if data_format == "NHWC": + image = np.transpose(image, (2, 0, 1)) # HWC => CHW + + channel, in_h, in_w = image.shape + out_h, out_w = size + + ratio_h = ratio_w = 0.0 + if (out_h > 1): + if (align_corners): + ratio_h = (in_h - 1.0) / (out_h - 1.0) + else: + ratio_h = 1.0 * in_h / out_h + if (out_w > 1): + if (align_corners): + ratio_w = (in_w - 1.0) / (out_w - 1.0) + else: + ratio_w = 1.0 * in_w / out_w + + out = np.zeros((channel, out_h, out_w)) + + if align_corners: + for i in range(out_h): + in_i = int(ratio_h * i + 0.5) + for j in range(out_w): + in_j = int(ratio_w * j + 0.5) + out[:, i, j] = image[:, in_i, in_j] + else: + for i in range(out_h): + in_i = int(ratio_h * i) + for j in range(out_w): + in_j = int(ratio_w * j) + out[:, i, j] = image[:, in_i, in_j] + + if data_format == "NHWC": + out = np.transpose(out, (1, 2, 0)) # CHW => HWC + + return out.astype(image.dtype) + + +def np_bilinear_interp(image, + size, + align_corners=True, + align_mode=0, + data_format='NCHW'): + """bilinear interpolation implement in shape [N, C, H, W]""" + if isinstance(size, int): + size = (size, size) + + if data_format == "NHWC": + image = np.transpose(image, (2, 0, 1)) # HWC => CHW + + channel, in_h, in_w = image.shape + out_h, out_w = size + + ratio_h = ratio_w = 0.0 + if out_h > 1: + if (align_corners): + ratio_h = (in_h - 1.0) / (out_h - 1.0) + else: + ratio_h = 1.0 * in_h / out_h + if out_w > 1: + if (align_corners): + ratio_w = (in_w - 1.0) / (out_w - 1.0) + else: + ratio_w = 1.0 * in_w / out_w + + out = np.zeros((channel, out_h, out_w)) + + for i in range(out_h): + if 
(align_mode == 0 and not align_corners): + h = int(ratio_h * (i + 0.5) - 0.5) + else: + h = int(ratio_h * i) + + h = max(0, h) + hid = 1 if h < in_h - 1 else 0 + if (align_mode == 0 and not align_corners): + idx_src_h = max(ratio_h * (i + 0.5) - 0.5, 0) + h1lambda = idx_src_h - h + else: + h1lambda = ratio_h * i - h + h2lambda = 1.0 - h1lambda + for j in range(out_w): + if (align_mode == 0 and not align_corners): + w = int(ratio_w * (j + 0.5) - 0.5) + else: + w = int(ratio_w * j) + w = max(0, w) + wid = 1 if w < in_w - 1 else 0 + if (align_mode == 0 and not align_corners): + idx_src_w = max(ratio_w * (j + 0.5) - 0.5, 0) + w1lambda = idx_src_w - w + else: + w1lambda = ratio_w * j - w + w2lambda = 1.0 - w1lambda + + out[:, i, j] = h2lambda*(w2lambda * image[:, h, w] + + w1lambda * image[:, h, w+wid]) + \ + h1lambda*(w2lambda * image[:, h+hid, w] + + w1lambda * image[:, h+hid, w+wid]) + + if data_format == "NHWC": + out = np.transpose(out, (1, 2, 0)) # CHW => HWC + + return out.astype(image.dtype) + + +def np_image_resize(images, size, interp_method, + align_corners=True, align_mode=1, + data_format="NCHW"): + if isinstance(size, int): + size = (size, size) + + results = [] + if interp_method == "nearest": + for image in images: + results.append(np_nearest_interp(image, + size=size, + align_corners=align_corners, + data_format=data_format)) + elif interp_method == "bilinear": + for image in images: + results.append(np_bilinear_interp(image, + size=size, + align_corners=align_corners, + align_mode=align_mode, + data_format=data_format)) + else: + raise ValueError("unknown interp_method") + + return np.stack(results, axis=0) + + +class TestImageResizeNearestNCHW(unittest.TestCase): + def setup(self): + self.image_shape1 = [3, 8, 8] + self.image_shape2 = [3, 2, 2] + self.size = (4, 4) + self.interp_method = "nearest" + self.data_format = "NCHW" + self.align_corners = False + self.align_mode = 0 + + self._is_np_built = False + self.build_np_data() + + def 
build_np_data(self): + if not self._is_np_built: + self.image1 = np.random.randint(0, 256, self.image_shape1, dtype="uint8") + self.image2 = np.random.randint(0, 256, self.image_shape2, dtype="uint8") + self.np_result = np_image_resize( + [self.image1, self.image2], + size=self.size, + interp_method=self.interp_method, + align_corners=self.align_corners, + align_mode=self.align_mode, + data_format=self.data_format) + self._is_np_built = True + + def test_output_dynamic(self): + if not core.is_compiled_with_cuda(): + return + + paddle.disable_static() + self.setup() + + images = paddle.tensor.create_array(dtype="uint8") + images = paddle.tensor.array_write(paddle.to_tensor(self.image1), + paddle.to_tensor(0), images) + images = paddle.tensor.array_write(paddle.to_tensor(self.image2), + paddle.to_tensor(1), images) + + # NOTE: image_resize takes TensorArray as input, which cannot + # create by Python API in dynamic mode + try: + dy_result = image_resize(images, self.size, + interp_method=self.interp_method, + align_corners=self.align_corners, + align_mode=self.align_mode, + data_format=self.data_format) + except: + pass + + def test_output_static(self): + if not core.is_compiled_with_cuda(): + return + + paddle.enable_static() + self.setup() + + images = paddle.tensor.create_array(dtype="uint8") + + idx = fluid.layers.fill_constant(shape=[1], dtype="int64", value=0) + image1 = fluid.layers.assign(self.image1.astype('int32')) + image1 = fluid.layers.cast(image1, dtype='uint8') + images = paddle.tensor.array_write(image1, idx, images) + + image2 = fluid.layers.assign(self.image2.astype('int32')) + image2 = fluid.layers.cast(image2, dtype='uint8') + images = paddle.tensor.array_write(image2, idx + 1, images) + + out = image_resize(images, self.size, + interp_method=self.interp_method, + align_corners=self.align_corners, + align_mode=self.align_mode, + data_format=self.data_format) + + exe = paddle.static.Executor(paddle.CUDAPlace(0)) + result, = 
exe.run(paddle.static.default_main_program(), + fetch_list=[out]) + assert np.allclose(result, self.np_result, rtol=1) + + paddle.disable_static() + + +class TestImageResizeNearestNHWC(TestImageResizeNearestNCHW): + def setup(self): + self.image_shape1 = [32, 32, 3] + self.image_shape2 = [16, 16, 3] + self.size = 20 + self.interp_method = "nearest" + self.data_format = "NHWC" + self.align_corners = True + self.align_mode = 1 + + self._is_np_built = False + self.build_np_data() + + def test_output_dynamic(self): + pass + + +class TestImageResizeNearestNCHWAlignCorner(TestImageResizeNearestNHWC): + def setup(self): + self.image_shape1 = [3, 32, 32] + self.image_shape2 = [3, 16, 16] + self.size = 30 + self.interp_method = "nearest" + self.data_format = "NCHW" + self.align_corners = True + self.align_mode = 1 + + self._is_np_built = False + self.build_np_data() + + +class TestImageResizeNearestNHWCAlignCorner(TestImageResizeNearestNHWC): + def setup(self): + self.image_shape1 = [32, 32, 3] + self.image_shape2 = [16, 16, 3] + self.size = (20, 30) + self.interp_method = "nearest" + self.data_format = "NHWC" + self.align_corners = True + self.align_mode = 1 + + self._is_np_built = False + self.build_np_data() + + +class TestImageResizeBilinearNCHW(TestImageResizeNearestNHWC): + def setup(self): + self.image_shape1 = [3, 32, 32] + self.image_shape2 = [3, 16, 16] + self.size = (20, 30) + self.interp_method = "bilinear" + self.data_format = "NCHW" + self.align_corners = False + self.align_mode = 1 + + self._is_np_built = False + self.build_np_data() + + +class TestImageResizeBilinearNHWC(TestImageResizeNearestNHWC): + def setup(self): + self.image_shape1 = [32, 32, 3] + self.image_shape2 = [16, 16, 3] + self.size = (20, 30) + self.interp_method = "bilinear" + self.data_format = "NHWC" + self.align_corners = False + self.align_mode = 1 + + self._is_np_built = False + self.build_np_data() + + +class TestImageResizeBilinearNCHWAlignMode0(TestImageResizeNearestNHWC): + def 
setup(self): + self.image_shape1 = [3, 32, 32] + self.image_shape2 = [3, 16, 16] + self.size = (20, 30) + self.interp_method = "bilinear" + self.data_format = "NCHW" + self.align_corners = False + self.align_mode = 0 + + self._is_np_built = False + self.build_np_data() + + +class TestImageResizeBilinearNHWCAlignMode0(TestImageResizeNearestNHWC): + def setup(self): + self.image_shape1 = [32, 32, 3] + self.image_shape2 = [16, 16, 3] + self.size = (20, 30) + self.interp_method = "bilinear" + self.data_format = "NHWC" + self.align_corners = False + self.align_mode = 0 + + self._is_np_built = False + self.build_np_data() + + +class TestImageResizeBilinearNCHWAlignCorner(TestImageResizeNearestNHWC): + def setup(self): + self.image_shape1 = [3, 32, 32] + self.image_shape2 = [3, 16, 16] + self.size = (20, 30) + self.interp_method = "bilinear" + self.data_format = "NCHW" + self.align_corners = True + self.align_mode = 1 + + self._is_np_built = False + self.build_np_data() + + +class TestImageResizeBilinearNHWCAlignCorner(TestImageResizeNearestNHWC): + def setup(self): + self.image_shape1 = [32, 32, 3] + self.image_shape2 = [16, 16, 3] + self.size = (20, 30) + self.interp_method = "bilinear" + self.data_format = "NHWC" + self.align_corners = True + self.align_mode = 1 + + self._is_np_built = False + self.build_np_data() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index 9cb2641ae8774c..a4539a9ea58727 100644 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -1603,7 +1603,7 @@ def image_resize(x, interp_method='bilinear', align_corners=True, align_mode=1, - data_layout='NCHW', + data_format='NCHW', seed=0, name=None): """ @@ -1627,7 +1627,7 @@ def image_resize(x, align_mode (int32, optional): Optional for bilinear interpolation, can be 0 for src_idx = scale*(dst_indx+0.5)-0.5, can be 1 for src_idx = scale*dst_index. 
Default: 1 - data_layout (str, optional): Only used in an optional string + data_format (str, optional): Only used in an optional string from: NHWC, NCHW. Specify that the data format of the input and output data is channel_first or channel_last. Default: NCHW seed (int, optional): The random seed. Default: 0 @@ -1643,14 +1643,14 @@ def image_resize(x, .. code-block:: python import paddle - from paddle.vision.ops import random_crop_and_resize + from paddle.vision.ops import image_resize data = paddle.rand([3, 256, 256]) - out = random_crop_and_resize([data]) + out = image_resize([data]) """ - check_type(size, 'size', (int, tuple), 'random_crop_and_resize') + check_type(size, 'size', (int, tuple), 'image_resize') assert interp_method in ['bilinear', 'nearest'] - assert data_layout in ['NCHW', 'NHWC'] + assert data_format in ['NCHW', 'NHWC'] if isinstance(size, int): size = (size, size) @@ -1658,7 +1658,7 @@ def image_resize(x, out = _C_ops.batch_resize( x, "size", size, "interp_method", interp_method, "align_corners", align_corners, "align_mode", - align_mode, "data_layout", data_layout, "seed", seed) + align_mode, "data_format", data_format, "seed", seed) return out helper = LayerHelper('batch_resize', **locals()) @@ -1670,7 +1670,7 @@ def image_resize(x, "interp_method": interp_method, "align_corners": align_corners, "align_mode": align_mode, - "data_layout": data_layout, + "data_format": data_format, "seed": seed, } helper.append_op( From 948e03587d99d3903e5ba782036423b23d320d79 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Wed, 30 Mar 2022 15:10:29 +0000 Subject: [PATCH 80/95] add unittest for random_crop_and_resize. 
test=develop --- paddle/fluid/operators/data/CMakeLists.txt | 2 +- .../data/batch_random_crop_and_resize_op.cc | 120 +++++++++ ....cu => batch_random_crop_and_resize_op.cu} | 89 ++++--- ...op.h => batch_random_crop_and_resize_op.h} | 8 +- .../operators/data/unity_build_rule.cmake | 4 +- python/paddle/tests/test_ops_crop_resize.py | 233 +++++++++++++++++- python/paddle/vision/ops.py | 45 ++-- 7 files changed, 432 insertions(+), 69 deletions(-) create mode 100644 paddle/fluid/operators/data/batch_random_crop_and_resize_op.cc rename paddle/fluid/operators/data/{random_crop_and_resize_op.cu => batch_random_crop_and_resize_op.cu} (83%) rename paddle/fluid/operators/data/{random_crop_and_resize_op.h => batch_random_crop_and_resize_op.h} (83%) diff --git a/paddle/fluid/operators/data/CMakeLists.txt b/paddle/fluid/operators/data/CMakeLists.txt index d83d91b768dea2..d2b28e8e5710e8 100644 --- a/paddle/fluid/operators/data/CMakeLists.txt +++ b/paddle/fluid/operators/data/CMakeLists.txt @@ -17,7 +17,7 @@ cc_library(image_decoder SRCS image_decoder.cc DEPS random_roi_generator ${OP_HE op_library(batch_decode_random_crop_op SRCS batch_decode_random_crop_op.cc batch_decode_random_crop_op.cu DEPS image_decoder ${OP_HEADER_DEPS}) op_library(batch_decode_op SRCS batch_decode_op.cc batch_decode_op.cu DEPS image_decoder ${OP_HEADER_DEPS}) -op_library(random_crop_and_resize_op SRCS random_crop_and_resize_op.cc random_crop_and_resize_op.cu DEPS ${OP_HEADER_DEPS}) +op_library(batch_random_crop_and_resize_op SRCS batch_random_crop_and_resize_op.cc batch_random_crop_and_resize_op.cu DEPS ${OP_HEADER_DEPS}) op_library(batch_resize_op SRCS batch_resize_op.cc batch_resize_op.cu DEPS ${OP_HEADER_DEPS}) op_library(file_label_loader_op SRCS file_label_loader_op.cc DEPS ${OP_HEADER_DEPS}) diff --git a/paddle/fluid/operators/data/batch_random_crop_and_resize_op.cc b/paddle/fluid/operators/data/batch_random_crop_and_resize_op.cc new file mode 100644 index 00000000000000..b644541786083e --- 
/dev/null +++ b/paddle/fluid/operators/data/batch_random_crop_and_resize_op.cc @@ -0,0 +1,120 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/data/batch_random_crop_and_resize_op.h" + +namespace paddle { +namespace operators { +namespace data { + +class BatchRandomCropAndResizeOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "BatchRandomCropAndResize"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", + "BatchRandomCropAndResize"); + + auto size = ctx->Attrs().Get>("size"); + PADDLE_ENFORCE_EQ(size.size(), 2, + platform::errors::InvalidArgument( + "The length of Attrs(size) should be 2.")); + PADDLE_ENFORCE_GT(size[0], 0, + platform::errors::InvalidArgument( + "h in Attr(size) of Op(BatchRandomCropAndResize) " + "should be greater than 0.")); + PADDLE_ENFORCE_GT(size[1], 0, + platform::errors::InvalidArgument( + "w in Attr(size) of Op(BatchRandomCropAndResize) " + "should be greater than 0.")); + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::proto::VarType::UINT8, ctx.GetPlace()); + } + + framework::OpKernelType GetKernelTypeForVar( + 
const std::string& var_name, const framework::Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const override { + if (var_name == "X") { + return expected_kernel_type; + } + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); + } +}; + +class BatchRandomCropAndResizeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(LoDTensorArray). A batch of instances to random crop."); + AddOutput("Out", "(Tensor). The cropped instance batch."); + AddAttr("aspect_ratio_min", "").SetDefault(3./4.); + AddAttr("aspect_ratio_max", "").SetDefault(4./3.); + AddAttr("area_min", "").SetDefault(0.08); + AddAttr("area_max", "").SetDefault(1.); + AddAttr("num_attempts", "").SetDefault(10); + AddAttr>( + "size", "expected output size of the crop, for each edge."); + AddAttr("interp_method", + "(string, default \"bilinear\"), interpolation " + "method, can be \"bilinear\" for " + "bilinear interpolation and \"nearest\" for nearest " + "neighbor interpolation.") + .SetDefault("bilinear"); + AddAttr( + "align_corners", + "an optional bool. Defaults to True. " + "If True, the centers of 4 corner pixels of the input and output " + "tensors are aligned, preserving the values at the corner pixels, " + "If False, are not aligned") + .SetDefault(true); + AddAttr("align_mode", + "(int, default \'1\'), optional for bilinear interpolation, " + "can be \'0\' for src_idx = scale*(dst_indx+0.5)-0.5 , " + "can be \'1\' for src_idx = scale*dst_index .") + .SetDefault(1); + AddAttr( + "data_format", + "(string, default NCHW) Only used in " + "an optional string from: \"NHWC\", \"NCHW\". " + "Specify that the data format of the input and output data is " + "channel_first or channel_last.") + .SetDefault("NCHW"); + AddComment(R"DOC( + Crop the input data to random size and aspect ratio. 
+ A crop of random size (default: of 0.08 to 1.0) of the original size and a random + aspect ratio (default: of 3/4 to 1.33) of the original aspect ratio is made. + After applying crop transfrom, the input data will be resized to given size. + )DOC"); + } +}; + +} // namespace data +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR( + batch_random_crop_and_resize, ops::data::BatchRandomCropAndResizeOp, + ops::data::BatchRandomCropAndResizeOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_CPU_KERNEL(batch_random_crop_and_resize, + ops::data::BatchRandomCropAndResizeCPUKernel) diff --git a/paddle/fluid/operators/data/random_crop_and_resize_op.cu b/paddle/fluid/operators/data/batch_random_crop_and_resize_op.cu similarity index 83% rename from paddle/fluid/operators/data/random_crop_and_resize_op.cu rename to paddle/fluid/operators/data/batch_random_crop_and_resize_op.cu index a8e3e02c95ef96..a663600dad5e04 100644 --- a/paddle/fluid/operators/data/random_crop_and_resize_op.cu +++ b/paddle/fluid/operators/data/batch_random_crop_and_resize_op.cu @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/operators/data/random_crop_and_resize_op.h" +#include "paddle/fluid/operators/data/batch_random_crop_and_resize_op.h" +#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" namespace paddle { namespace operators { @@ -32,7 +32,7 @@ __global__ void KeNearestNeighborInterpFw( const size_t out_img_w, const size_t output_h, const size_t output_w, const size_t num_channels, const float ratio_h, const float ratio_w, const size_t idx_h, const size_t idx_w, const bool align_corners, - const DataLayout data_layout) { + const DataLayout data_format) { int nthreads = output_h * output_w; int tid = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; @@ -47,7 +47,7 @@ __global__ void KeNearestNeighborInterpFw( // get output c, h, w index int channel_id, out_img_idy, out_img_idx; - if (data_layout == DataLayout::kNCHW) { + if (data_format == DataLayout::kNCHW) { channel_id = out_id_w / out_img_size; out_img_idy = (out_id_w % out_img_size) / out_img_w; out_img_idx = tid % out_img_w; @@ -68,7 +68,7 @@ __global__ void KeNearestNeighborInterpFw( : static_cast(ratio_w * out_img_idx); in_img_idx += idx_w; - if (data_layout == DataLayout::kNCHW) { + if (data_format == DataLayout::kNCHW) { out[tid] = in[out_id_h * input_w + channel_id * in_img_size + in_img_idy * in_img_w + in_img_idx]; } else { @@ -85,7 +85,7 @@ __global__ void KeBilinearInterpFw( const size_t out_img_w, const size_t output_h, const size_t output_w, const size_t num_channels, const float ratio_h, const float ratio_w, const size_t idx_h, const size_t idx_w, const bool align_corners, - const int align_mode, const DataLayout data_layout) { + const int align_mode, const DataLayout data_format) { int nthreads = output_h * output_w; int tid = blockIdx.x * blockDim.x + 
threadIdx.x; int stride = blockDim.x * gridDim.x; @@ -101,7 +101,7 @@ __global__ void KeBilinearInterpFw( // get output c, h, w index int channel_id, out_img_idy, out_img_idx; - if (data_layout == DataLayout::kNCHW) { + if (data_format == DataLayout::kNCHW) { channel_id = out_id_w / out_img_size; out_img_idy = (out_id_w % out_img_size) / out_img_w; out_img_idx = tid % out_img_w; @@ -135,7 +135,7 @@ __global__ void KeBilinearInterpFw( : ratio_w * out_img_idx + idx_w - in_img_idx; T w2lambda = 1.f - w1lambda; - if (data_layout == DataLayout::kNCHW) { + if (data_format == DataLayout::kNCHW) { const T* in_pos = &in[out_id_h * input_w + channel_id * in_img_size + in_img_idy * in_img_w + in_img_idx]; @@ -161,19 +161,19 @@ __global__ void KeBilinearInterpFw( } template -static void RandomCropAndResizeFwd( +static void BatchRandomCropAndResizeFwd( const framework::ExecutionContext& ctx, const framework::LoDTensor& input, framework::Tensor* output, const std::vector out_size, const std::string interp_method, const bool align_corners, const int align_mode, const int img_h, const int img_w, const int c, const int idx_h, const int idx_w, const int crop_h, const int crop_w, - const DataLayout data_layout) { + const DataLayout data_format) { auto input_data = input.template data(); int out_h = static_cast(out_size[0]); int out_w = static_cast(out_size[1]); framework::DDim dim_out; - if (data_layout == DataLayout::kNCHW) { + if (data_format == DataLayout::kNCHW) { dim_out = {c, out_h, out_w}; } else { dim_out = {out_h, out_w, c}; @@ -207,13 +207,13 @@ static void RandomCropAndResizeFwd( T><<>>( input_data, crop_h, crop_w, 1, in_chw, output_data, out_h, out_w, 1, - out_chw, c, ratio_h, ratio_w, idx_h, idx_w, align_corners, data_layout); + out_chw, c, ratio_h, ratio_w, idx_h, idx_w, align_corners, data_format); } else if ("bilinear" == interp_method) { KeBilinearInterpFw<<>>( input_data, crop_h, crop_w, 1, in_chw, output_data, out_h, out_w, 1, out_chw, c, ratio_h, ratio_w, idx_h, 
idx_w, align_corners, align_mode, - data_layout); + data_format); } } @@ -269,7 +269,7 @@ static void GetCropParameters(const int height, const int width, } template -class RandomCropAndResizeCUDAKernel : public framework::OpKernel { +class BatchRandomCropAndResizeCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE_EQ( @@ -282,27 +282,39 @@ class RandomCropAndResizeCUDAKernel : public framework::OpKernel { "The size of X must be greater than 0.")); auto* out = ctx.Output("Out"); - // get size, scale, ratio - auto size = ctx.Attr>("size"); - auto scale = ctx.Attr>("scale"); - auto ratio = ctx.Attr>("ratio"); - // get random seed - int seed = ctx.Attr("seed"); - // get data_layout - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); + auto aspect_ratio_min = ctx.Attr("aspect_ratio_min"); + auto aspect_ratio_max = ctx.Attr("aspect_ratio_max"); + AspectRatioRange aspect_ratio_range{aspect_ratio_min, aspect_ratio_max}; + + auto area_min = ctx.Attr("area_min"); + auto area_max = ctx.Attr("area_max"); + AreaRange area_range{area_min, area_max}; + + auto* generators = GeneratorManager::Instance()->GetGenerators( + x->size(), x->size(), aspect_ratio_range, + area_range); + + const std::vector size = ctx.Attr>("size"); + + // get data_format + const std::string data_format_str = ctx.Attr("data_format"); + const DataLayout data_format = + framework::StringToDataLayout(data_format_str); // get interpolation method const std::string interp_method = ctx.Attr("interp_method"); bool align_corners = ctx.Attr("align_corners"); int align_mode = ctx.Attr("align_mode"); auto* img = &x->at(0); - int64_t img_c = data_layout == DataLayout::kNCHW ? \ + int64_t img_c = data_format == DataLayout::kNCHW ? 
\ img->dims()[0] : img->dims()[2]; - std::vector out_dim = {static_cast(x->size()), - img_c, size[0], size[1]}; + std::vector out_dim; + if (data_format == DataLayout::kNCHW) { + out_dim = {static_cast(x->size()), img_c, size[0], size[1]}; + } else { + out_dim = {static_cast(x->size()), size[0], size[1], img_c}; + } out->Resize(phi::make_ddim(out_dim)); out->mutable_data(ctx.GetPlace()); @@ -310,16 +322,19 @@ class RandomCropAndResizeCUDAKernel : public framework::OpKernel { for (int i = 0; i < x->size(); i++) { img = &x->at(i); img_h = - data_layout == DataLayout::kNCHW ? img->dims()[1] : img->dims()[0]; + data_format == DataLayout::kNCHW ? img->dims()[1] : img->dims()[0]; img_w = - data_layout == DataLayout::kNCHW ? img->dims()[2] : img->dims()[1]; - GetCropParameters(img_h, img_w, scale, ratio, &idx_h, &idx_w, &crop_h, - &crop_w, seed); + data_format == DataLayout::kNCHW ? img->dims()[2] : img->dims()[1]; + ROI roi; + generators->at(i)->GenerateRandomROI(img_w, img_h, &roi); + // GetCropParameters(img_h, img_w, scale, ratio, &idx_h, &idx_w, &crop_h, + // &crop_w, seed); auto out_tensor = out->Slice(i, i + 1); - RandomCropAndResizeFwd(ctx, *img, &out_tensor, size, interp_method, - align_corners, align_mode, img_h, img_w, img_c, - idx_h, idx_w, crop_h, crop_w, data_layout); + BatchRandomCropAndResizeFwd( + ctx, *img, &out_tensor, size, interp_method, align_corners, + align_mode, img_h, img_w, img_c, roi.y, roi.x, roi.h, + roi.w, data_format); } } }; @@ -329,6 +344,6 @@ class RandomCropAndResizeCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(random_crop_and_resize, - ops::data::RandomCropAndResizeCUDAKernel, - ops::data::RandomCropAndResizeCUDAKernel); +REGISTER_OP_CUDA_KERNEL(batch_random_crop_and_resize, + ops::data::BatchRandomCropAndResizeCUDAKernel, + ops::data::BatchRandomCropAndResizeCUDAKernel); diff --git a/paddle/fluid/operators/data/random_crop_and_resize_op.h 
b/paddle/fluid/operators/data/batch_random_crop_and_resize_op.h similarity index 83% rename from paddle/fluid/operators/data/random_crop_and_resize_op.h rename to paddle/fluid/operators/data/batch_random_crop_and_resize_op.h index 86dd1fd95ea0a9..6ab18f3f5e4a70 100644 --- a/paddle/fluid/operators/data/random_crop_and_resize_op.h +++ b/paddle/fluid/operators/data/batch_random_crop_and_resize_op.h @@ -22,21 +22,19 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device_context.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include -#endif +#include "paddle/fluid/operators/data/random_roi_generator.h" namespace paddle { namespace operators { namespace data { template -class RandomCropAndResizeCPUKernel : public framework::OpKernel { +class BatchRandomCropAndResizeCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { // no cpu kernel. PADDLE_THROW(platform::errors::Unimplemented( - "RandomCropAndResize op only supports GPU now.")); + "BatchRandomCropAndResize op only supports GPU now.")); } }; diff --git a/paddle/fluid/operators/data/unity_build_rule.cmake b/paddle/fluid/operators/data/unity_build_rule.cmake index 99146bf156dd69..354e611aa570bd 100644 --- a/paddle/fluid/operators/data/unity_build_rule.cmake +++ b/paddle/fluid/operators/data/unity_build_rule.cmake @@ -15,7 +15,7 @@ register_unity_group(cc batch_decode_op.cc batch_resize_op.cc mirror_normalize_op.cc - random_crop_and_resize_op.cc) + batch_random_crop_and_resize_op.cc) register_unity_group(cu dataloader_op.cu.cc @@ -24,4 +24,4 @@ register_unity_group(cu batch_decode_op.cu batch_resize_op.cu mirror_normalize_op.cu - random_crop_and_resize_op.cu) + batch_random_crop_and_resize_op.cu) diff --git a/python/paddle/tests/test_ops_crop_resize.py b/python/paddle/tests/test_ops_crop_resize.py index 44efa0f5aac8d8..551851f97ae1df 100644 --- a/python/paddle/tests/test_ops_crop_resize.py +++ 
b/python/paddle/tests/test_ops_crop_resize.py @@ -20,7 +20,7 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core -from paddle.vision.ops import image_resize +from paddle.vision.ops import image_resize, random_crop_and_resize def np_nearest_interp(image, @@ -166,13 +166,13 @@ def np_image_resize(images, size, interp_method, class TestImageResizeNearestNCHW(unittest.TestCase): def setup(self): - self.image_shape1 = [3, 8, 8] - self.image_shape2 = [3, 2, 2] - self.size = (4, 4) + self.image_shape1 = [3, 32, 32] + self.image_shape2 = [3, 16, 16] + self.size = (20, 30) self.interp_method = "nearest" self.data_format = "NCHW" self.align_corners = False - self.align_mode = 0 + self.align_mode = 1 self._is_np_built = False self.build_np_data() @@ -375,5 +375,228 @@ def setup(self): self.build_np_data() +class TestImageCropResizeNearestNCHW(unittest.TestCase): + def setup(self): + self.image_shape1 = [3, 16, 16] + self.image_shape2 = [3, 32, 32] + self.size = (20, 30) + self.interp_method = "nearest" + self.data_format = "NCHW" + self.align_corners = False + self.align_mode = 1 + + self.out_shape = (2, 3, 20, 30) + + self._is_np_built = False + self.build_np_data() + + def build_np_data(self): + if not self._is_np_built: + self.image1 = np.random.randint(0, 256, self.image_shape1, dtype="uint8") + self.image2 = np.random.randint(0, 256, self.image_shape2, dtype="uint8") + self._is_np_built = True + + def test_output_dynamic(self): + if not core.is_compiled_with_cuda(): + return + + paddle.disable_static() + self.setup() + + images = paddle.tensor.create_array(dtype="uint8") + images = paddle.tensor.array_write(paddle.to_tensor(self.image1), + paddle.to_tensor(0), images) + images = paddle.tensor.array_write(paddle.to_tensor(self.image2), + paddle.to_tensor(1), images) + + # NOTE: image_resize takes TensorArray as input, which cannot + # create by Python API in dynamic mode + try: + dy_result = random_crop_and_resize( + images, self.size, + 
interp_method=self.interp_method, + align_corners=self.align_corners, + align_mode=self.align_mode, + data_format=self.data_format) + except: + pass + + def test_output_static(self): + if not core.is_compiled_with_cuda(): + return + + paddle.enable_static() + self.setup() + + images = paddle.tensor.create_array(dtype="uint8") + + idx = fluid.layers.fill_constant(shape=[1], dtype="int64", value=0) + image1 = fluid.layers.assign(self.image1.astype('int32')) + image1 = fluid.layers.cast(image1, dtype='uint8') + images = paddle.tensor.array_write(image1, idx, images) + + image2 = fluid.layers.assign(self.image2.astype('int32')) + image2 = fluid.layers.cast(image2, dtype='uint8') + images = paddle.tensor.array_write(image2, idx + 1, images) + + out = random_crop_and_resize( + images, self.size, + interp_method=self.interp_method, + align_corners=self.align_corners, + align_mode=self.align_mode, + data_format=self.data_format) + + exe = paddle.static.Executor(paddle.CUDAPlace(0)) + result, = exe.run(paddle.static.default_main_program(), + fetch_list=[out]) + assert result.shape == self.out_shape + + paddle.disable_static() + + +class TestImageCropResizeNearestNHWC(TestImageCropResizeNearestNCHW): + def setup(self): + self.image_shape1 = [16, 16, 3] + self.image_shape2 = [32, 32, 3] + self.size = 20 + self.interp_method = "nearest" + self.data_format = "NHWC" + self.align_corners = False + self.align_mode = 1 + + self.out_shape = (2, 20, 20, 3) + + self._is_np_built = False + self.build_np_data() + + +class TestImageCropResizeNearestNCHWAlignCorner(TestImageCropResizeNearestNCHW): + def setup(self): + self.image_shape1 = [3, 16, 16] + self.image_shape2 = [3, 32, 32] + self.size = 20 + self.interp_method = "nearest" + self.data_format = "NCHW" + self.align_corners = True + self.align_mode = 1 + + self.out_shape = (2, 3, 20, 20) + + self._is_np_built = False + self.build_np_data() + + +class TestImageCropResizeNearestNHWCAlignCorner(TestImageCropResizeNearestNCHW): + def 
setup(self): + self.image_shape1 = [16, 16, 3] + self.image_shape2 = [32, 32, 3] + self.size = (20, 30) + self.interp_method = "nearest" + self.data_format = "NHWC" + self.align_corners = True + self.align_mode = 1 + + self.out_shape = (2, 20, 30, 3) + + self._is_np_built = False + self.build_np_data() + + +class TestImageCropResizeBilinearNCHW(TestImageCropResizeNearestNCHW): + def setup(self): + self.image_shape1 = [3, 16, 16] + self.image_shape2 = [3, 32, 32] + self.size = (20, 30) + self.interp_method = "bilinear" + self.data_format = "NCHW" + self.align_corners = False + self.align_mode = 1 + + self.out_shape = (2, 3, 20, 30) + + self._is_np_built = False + self.build_np_data() + + +class TestImageCropResizeNearestNHWC(TestImageCropResizeNearestNCHW): + def setup(self): + self.image_shape1 = [16, 16, 3] + self.image_shape2 = [32, 32, 3] + self.size = (20, 30) + self.interp_method = "bilinear" + self.data_format = "NHWC" + self.align_corners = False + self.align_mode = 1 + + self.out_shape = (2, 20, 30, 3) + + self._is_np_built = False + self.build_np_data() + + +class TestImageCropResizeBilinearNCHWAlignMode0(TestImageCropResizeNearestNCHW): + def setup(self): + self.image_shape1 = [3, 16, 16] + self.image_shape2 = [3, 32, 32] + self.size = (20, 30) + self.interp_method = "bilinear" + self.data_format = "NCHW" + self.align_corners = False + self.align_mode = 0 + + self.out_shape = (2, 3, 20, 30) + + self._is_np_built = False + self.build_np_data() + + +class TestImageCropResizeNearestNHWCAlignMode0(TestImageCropResizeNearestNCHW): + def setup(self): + self.image_shape1 = [16, 16, 3] + self.image_shape2 = [32, 32, 3] + self.size = (20, 30) + self.interp_method = "bilinear" + self.data_format = "NHWC" + self.align_corners = False + self.align_mode = 0 + + self.out_shape = (2, 20, 30, 3) + + self._is_np_built = False + self.build_np_data() + + +class TestImageCropResizeBilinearNCHWAlignCorner(TestImageCropResizeNearestNCHW): + def setup(self): + self.image_shape1 
= [3, 16, 16] + self.image_shape2 = [3, 32, 32] + self.size = (20, 30) + self.interp_method = "bilinear" + self.data_format = "NCHW" + self.align_corners = True + self.align_mode = 1 + + self.out_shape = (2, 3, 20, 30) + + self._is_np_built = False + self.build_np_data() + + +class TestImageCropResizeNearestNHWCAlignCorner(TestImageCropResizeNearestNCHW): + def setup(self): + self.image_shape1 = [16, 16, 3] + self.image_shape2 = [32, 32, 3] + self.size = (20, 30) + self.interp_method = "bilinear" + self.data_format = "NHWC" + self.align_corners = True + self.align_mode = 1 + + self.out_shape = (2, 20, 30, 3) + + self._is_np_built = False + self.build_np_data() + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index a4539a9ea58727..1fd6b75e712959 100644 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -1508,12 +1508,15 @@ def forward(self, x, boxes, boxes_num, aligned=True): def random_crop_and_resize(x, size, - scale=(0.08, 1.0), - ratio=(3. / 4., 4. / 3.), + aspect_ratio_min=3./4., + aspect_ratio_max=4./3., + area_min=0.08, + area_max=1., + num_attempts=10, interp_method='bilinear', align_corners=True, align_mode=1, - data_layout='NCHW', + data_format='NCHW', seed=0, name=None): """ @@ -1540,7 +1543,7 @@ def random_crop_and_resize(x, align_mode (int32, optional): Optional for bilinear interpolation, can be 0 for src_idx = scale*(dst_indx+0.5)-0.5, can be 1 for src_idx = scale*dst_index. Default: 1 - data_layout (str, optional): Only used in an optional string + data_format (str, optional): Only used in an optional string from: NHWC, NCHW. Specify that the data format of the input and output data is channel_first or channel_last. Default: NCHW seed (int, optional): The random seed. Default: 0 @@ -1556,42 +1559,46 @@ def random_crop_and_resize(x, .. 
code-block:: python import paddle - from paddle.vision.ops import random_crop_and_resize + from paddle.vision.ops import batch_random_crop_and_resize data = paddle.rand([3, 256, 256]) - out = random_crop_and_resize([data]) + out = batch_random_crop_and_resize([data]) """ - check_type(size, 'size', (int, tuple), 'random_crop_and_resize') - check_type(scale, 'scale', (list, tuple), 'random_crop_and_resize') - check_type(ratio, 'ratio', (list, tuple), 'random_crop_and_resize') + check_type(size, 'size', (int, tuple), 'batch_random_crop_and_resize') assert interp_method in ['bilinear', 'nearest'] - assert data_layout in ['NCHW', 'NHWC'] + assert data_format in ['NCHW', 'NHWC'] if isinstance(size, int): size = (size, size) if in_dygraph_mode(): - out = _C_ops.random_crop_and_resize( - x, "size", size, "scale", scale, "ratio", ratio, "interp_method", - interp_method, "align_corners", align_corners, "align_mode", - align_mode, "data_layout", data_layout, "seed", seed) + out = _C_ops.batch_random_crop_and_resize( + x, "size", size, "aspect_ratio_min", aspect_ratio_min, + "aspect_ratio_max", aspect_ratio_max, "area_max", area_max, + "area_min", area_min, "num_attempts", num_attempts, + "interp_method", interp_method, "align_corners", + align_corners, "align_mode", align_mode, + "data_format", data_format, "seed", seed) return out - helper = LayerHelper('random_crop_and_resize', **locals()) + helper = LayerHelper('batch_random_crop_and_resize', **locals()) dtype = helper.input_dtype() out = helper.create_variable_for_type_inference(dtype) inputs = {"X": x} attrs = { "size": size, - "scale": scale, - "ratio": ratio, + "aspect_ratio_min": aspect_ratio_min, + "aspect_ratio_max": aspect_ratio_max, + "area_min": area_min, + "area_max": area_max, + "num_attempts": num_attempts, "interp_method": interp_method, "align_corners": align_corners, "align_mode": align_mode, - "data_layout": data_layout, + "data_format": data_format, "seed": seed, } helper.append_op( - 
type="random_crop_and_resize", + type="batch_random_crop_and_resize", inputs=inputs, outputs={"Out": out}, attrs=attrs) From a738cfbd23ffacbe6faca4e977ef86fceeaf47cc Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Fri, 1 Apr 2022 08:48:57 +0000 Subject: [PATCH 81/95] add image decode unittest. test=develop --- CMakeLists.txt | 2 +- .../fluid/operators/data/batch_decode_op.cc | 6 - .../fluid/operators/data/batch_decode_op.cu | 3 +- .../data/batch_decode_random_crop_op.cc | 8 +- .../data/batch_decode_random_crop_op.cu | 15 +- paddle/fluid/operators/data/image_decoder.cc | 42 +---- paddle/fluid/operators/data/image_decoder.h | 14 +- python/paddle/tests/test_ops_decode.py | 155 ++++++++++++++++++ python/paddle/vision/ops.py | 21 +-- python/paddle/vision/reader.py | 4 - 10 files changed, 185 insertions(+), 85 deletions(-) create mode 100644 python/paddle/tests/test_ops_decode.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 84b01ad2f89768..9f986dcb59f3fa 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -244,7 +244,7 @@ option(NEW_RELEASE_ALL "PaddlePaddle next-level release strategy for all arche option(NEW_RELEASE_JIT "PaddlePaddle next-level release strategy for backup jit package" OFF) option(WITH_ASCEND_INT64 "Compile with int64 kernel for ascend NPU" OFF) option(WITH_POCKETFFT "Compile with pocketfft support" ON) -option(WITH_OPENCV "Compile with opencv" ON) +option(WITH_OPENCV "Compile with opencv" OFF) option(WITH_RECORD_BUILDTIME "Compile PaddlePaddle with record all targets build time" OFF) option(WITH_CUSTOM_DEVICE "Compile with custom device support" OFF) diff --git a/paddle/fluid/operators/data/batch_decode_op.cc b/paddle/fluid/operators/data/batch_decode_op.cc index 170cb0d3497005..648f2033e9a037 100644 --- a/paddle/fluid/operators/data/batch_decode_op.cc +++ b/paddle/fluid/operators/data/batch_decode_op.cc @@ -69,12 +69,6 @@ and 255. 
)DOC"); AddAttr("num_threads", "Path of the file to be readed.") .SetDefault(2); - AddAttr( - "mode", - "(string, default \"unchanged\"), The read mode used " - "for optionally converting the image, can be \"unchanged\" " - ",\"gray\" , \"rgb\" .") - .SetDefault("unchanged"); AddAttr("local_rank", "(int)" "The index of the op to start execution"); diff --git a/paddle/fluid/operators/data/batch_decode_op.cu b/paddle/fluid/operators/data/batch_decode_op.cu index dfd45a3a0073e0..93b93b9c674f87 100644 --- a/paddle/fluid/operators/data/batch_decode_op.cu +++ b/paddle/fluid/operators/data/batch_decode_op.cu @@ -28,7 +28,6 @@ class GPUBatchDecodeKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { int num_threads = ctx.Attr("num_threads"); - auto mode = ctx.Attr("mode"); auto local_rank = ctx.Attr("local_rank"); auto program_id = ctx.Attr("program_id"); auto host_memory_padding = ctx.Attr("host_memory_padding"); @@ -37,7 +36,7 @@ class GPUBatchDecodeKernel : public framework::OpKernel { // multi-phrase decode thread pool auto* decode_pool = ImageDecoderThreadPoolManager::Instance()->GetDecoderThreadPool( - program_id, num_threads, mode, local_rank, + program_id, num_threads, local_rank, static_cast(host_memory_padding), static_cast(device_memory_padding)); diff --git a/paddle/fluid/operators/data/batch_decode_random_crop_op.cc b/paddle/fluid/operators/data/batch_decode_random_crop_op.cc index 1a211332d2a30a..3e981a201fd477 100644 --- a/paddle/fluid/operators/data/batch_decode_random_crop_op.cc +++ b/paddle/fluid/operators/data/batch_decode_random_crop_op.cc @@ -110,12 +110,6 @@ and 255. 
"The index of the op to start execution"); AddAttr("num_threads", "Path of the file to be readed.") .SetDefault(2); - AddAttr( - "mode", - "(string, default \"unchanged\"), The read mode used " - "for optionally converting the image, can be \"unchanged\" " - ",\"gray\" , \"rgb\" .") - .SetDefault("unchanged"); AddAttr( "host_memory_padding", "(int64, default 0), pinned memory allocation padding number for Nvjpeg decoding") @@ -125,7 +119,7 @@ and 255. "(int64, default 0), device memory allocation padding number for Nvjpeg decoding") .SetDefault(0); AddAttr( - "data_layout", + "data_format", "(string, default NCHW) Only used in " "an optional string from: \"NHWC\", \"NCHW\". " "Specify that the data format of the input and output data is " diff --git a/paddle/fluid/operators/data/batch_decode_random_crop_op.cu b/paddle/fluid/operators/data/batch_decode_random_crop_op.cu index b2dc8a8238be8c..e65bce62e2536b 100644 --- a/paddle/fluid/operators/data/batch_decode_random_crop_op.cu +++ b/paddle/fluid/operators/data/batch_decode_random_crop_op.cu @@ -35,7 +35,6 @@ class GPUBatchDecodeRandomCropKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { int num_threads = ctx.Attr("num_threads"); - auto mode = ctx.Attr("mode"); auto local_rank = ctx.Attr("local_rank"); auto program_id = ctx.Attr("program_id"); auto host_memory_padding = ctx.Attr("host_memory_padding"); @@ -44,7 +43,7 @@ class GPUBatchDecodeRandomCropKernel : public framework::OpKernel { // multi-phrase decode thread pool auto* decode_pool = ImageDecoderThreadPoolManager::Instance()->GetDecoderThreadPool( - program_id, num_threads, mode, local_rank, + program_id, num_threads, local_rank, static_cast(host_memory_padding), static_cast(device_memory_padding)); @@ -58,12 +57,12 @@ class GPUBatchDecodeRandomCropKernel : public framework::OpKernel { auto& out_array = *out->GetMutable(); out_array.resize(batch_size); - const std::string data_layout_str = 
ctx.Attr("data_layout"); - const DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); + const std::string data_format_str = ctx.Attr("data_format"); + const DataLayout data_format = + framework::StringToDataLayout(data_format_str); framework::LoDTensorArray temp_array; - if (data_layout == DataLayout::kNCHW) { + if (data_format == DataLayout::kNCHW) { temp_array.resize(batch_size); } @@ -84,7 +83,7 @@ class GPUBatchDecodeRandomCropKernel : public framework::OpKernel { auto* x_data = x.data(); size_t x_numel = static_cast(x.numel()); - if (data_layout == DataLayout::kNCHW){ + if (data_format == DataLayout::kNCHW){ ImageDecodeTask task = { .bit_stream = x_data, .bit_len = x_numel, @@ -109,7 +108,7 @@ class GPUBatchDecodeRandomCropKernel : public framework::OpKernel { decode_pool->RunAll(true); - if (data_layout == DataLayout::kNCHW){ + if (data_format == DataLayout::kNCHW){ const auto& dev_ctx = ctx.cuda_device_context(); phi::funcs::Transpose trans; std::vector axis = {2, 0, 1}; diff --git a/paddle/fluid/operators/data/image_decoder.cc b/paddle/fluid/operators/data/image_decoder.cc index afac0aad3892ed..4c39030d1ad4f2 100644 --- a/paddle/fluid/operators/data/image_decoder.cc +++ b/paddle/fluid/operators/data/image_decoder.cc @@ -18,13 +18,12 @@ namespace paddle { namespace operators { namespace data { -ImageDecoder::ImageDecoder(std::string mode, int dev_id, +ImageDecoder::ImageDecoder(int dev_id, size_t host_memory_padding, size_t device_memory_padding) : nvjpeg_streams_(2), pinned_buffers_(2), - page_id_(0), - mode_(mode) { + page_id_(0) { platform::SetDeviceId(dev_id); // create nvjpeg handle and stream @@ -141,33 +140,7 @@ nvjpegStatus_t ImageDecoder::ParseDecodeParams( int64_t width = static_cast(widths[0]); int64_t height = static_cast(heights[0]); - nvjpegOutputFormat_t output_format; - int output_components; - - if (mode_ == "unchanged") { - if (components == 1) { - output_format = NVJPEG_OUTPUT_Y; - output_components = 1; - } else if 
(components == 3) { - output_format = NVJPEG_OUTPUT_RGBI; - output_components = 3; - } else { - PADDLE_THROW(platform::errors::Fatal( - "The provided mode(%s) does not support components(%d)", - mode_, components)); - } - } else if (mode_ == "gray") { - output_format = NVJPEG_OUTPUT_Y; - output_components = 1; - } else if (mode_ == "rgb") { - output_format = NVJPEG_OUTPUT_RGBI; - output_components = 3; - } else { - PADDLE_THROW(platform::errors::Fatal( - "The provided mode is not supported for JPEG files on GPU: %s!", mode_)); - } - - PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecodeParamsSetOutputFormat(decode_params_, output_format)); + PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecodeParamsSetOutputFormat(decode_params_, NVJPEG_OUTPUT_RGBI)); if (roi_generator) { ROI roi; @@ -178,13 +151,13 @@ nvjpegStatus_t ImageDecoder::ParseDecodeParams( width = roi.w; } - std::vector out_shape = {height, width, output_components}; + std::vector out_shape = {height, width, 3}; out->Resize(phi::make_ddim(out_shape)); // allocate memory and assign to out_image auto* data = out->mutable_data(place); out_image->channel[0] = data; - out_image->pitch[0] = output_components * width; + out_image->pitch[0] = width * 3; return NVJPEG_STATUS_SUCCESS; } @@ -223,10 +196,9 @@ void ImageDecoder::Run( } ImageDecoderThreadPool::ImageDecoderThreadPool( - const int num_threads, const std::string mode, const int dev_id, + const int num_threads, const int dev_id, const size_t host_memory_padding, const size_t device_memory_padding) : threads_(num_threads), - mode_(mode), dev_id_(dev_id), shutdown_(false), running_(false), @@ -294,7 +266,7 @@ void ImageDecoderThreadPool::SortTaskByLengthDescend() { void ImageDecoderThreadPool::ThreadLoop( const int thread_idx, const size_t host_memory_padding, const size_t device_memory_padding) { - ImageDecoder* decoder = new ImageDecoder(mode_, dev_id_, + ImageDecoder* decoder = new ImageDecoder(dev_id_, host_memory_padding, 
device_memory_padding); diff --git a/paddle/fluid/operators/data/image_decoder.h b/paddle/fluid/operators/data/image_decoder.h index 5ece77a845dfca..de332f0a2b963f 100644 --- a/paddle/fluid/operators/data/image_decoder.h +++ b/paddle/fluid/operators/data/image_decoder.h @@ -55,7 +55,7 @@ struct ImageDecodeTask { class ImageDecoder { public: - ImageDecoder(const std::string mode, int dev_id, + ImageDecoder(int dev_id, size_t host_memory_padding=0, size_t device_memory_padding=0); @@ -94,14 +94,12 @@ class ImageDecoder { nvjpegBufferDevice_t device_buffer_ = nullptr; int page_id_; - - const std::string mode_; }; class ImageDecoderThreadPool { public: - ImageDecoderThreadPool(const int num_threads, const std::string mode, - const int dev_id, size_t host_memory_padding, + ImageDecoderThreadPool(const int num_threads, const int dev_id, + size_t host_memory_padding, size_t device_memory_padding); ~ImageDecoderThreadPool(); @@ -123,7 +121,6 @@ class ImageDecoderThreadPool { const size_t device_memory_padding); std::vector threads_; - std::string mode_; int dev_id_; std::deque> task_queue_; @@ -159,15 +156,14 @@ class ImageDecoderThreadPoolManager { } ImageDecoderThreadPool* GetDecoderThreadPool( - const int64_t program_id, const int num_threads, - const std::string mode, const int dev_id, + const int64_t program_id, const int num_threads, const int dev_id, const size_t host_memory_padding, const size_t device_memory_padding) { auto iter = prog_id_to_pool_.find(program_id); if (iter == prog_id_to_pool_.end()) { prog_id_to_pool_[program_id] = std::unique_ptr( - new ImageDecoderThreadPool(num_threads, mode, dev_id, + new ImageDecoderThreadPool(num_threads, dev_id, host_memory_padding, device_memory_padding)); } diff --git a/python/paddle/tests/test_ops_decode.py b/python/paddle/tests/test_ops_decode.py new file mode 100644 index 00000000000000..95128be5f240cc --- /dev/null +++ b/python/paddle/tests/test_ops_decode.py @@ -0,0 +1,155 @@ +# Copyright (c) 2022 PaddlePaddle 
Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import cv2 +import unittest +import numpy as np + +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.utils.download import get_path_from_url +from paddle.vision.datasets import DatasetFolder +from paddle.vision.ops import image_decode, image_decode_random_crop +from paddle.vision.reader import file_label_loader + + +DATASET_HOME = os.path.expanduser("~/.cache/paddle/datasets") +DATASET_URL = "https://paddlemodels.cdn.bcebos.com/ImageNet_stub.tar" +DATASET_MD5 = "c7110519124a433901cf005a4a91b607" + + +class TestImageReaderDecodeCase1(unittest.TestCase): + def setup(self): + self.data_root = get_path_from_url(DATASET_URL, DATASET_HOME, + DATASET_MD5) + + self.batch_size = 16 + self.num_threads = 2 + self.host_memory_padding = 1000000 + self.device_memory_padding = 1000000 + + def test_static_output(self): + paddle.enable_static() + self.setup() + + indices = paddle.arange(self.batch_size) + image, label = file_label_loader(self.data_root, indices) + image = image_decode(image, + num_threads=self.num_threads) + exe = paddle.static.Executor(paddle.CUDAPlace(0)) + out_image, out_label = exe.run(paddle.static.default_main_program(), + fetch_list=[image, label]) + + assert len(out_image) == self.batch_size + for i in range(self.batch_size): + img = np.array(out_image[i]) + assert img.dtype == np.uint8 + assert img.shape[2] == 3 + assert 
np.all(img >= 0) + assert np.all(img <= 255) + + assert len(out_label) == self.batch_size + assert label.dtype == paddle.int64 + label = np.array(out_label) + assert np.all(label >= 0) + assert np.all(label <= 1) + + +class TestImageReaderDecodeCase2(TestImageReaderDecodeCase1): + def setup(self): + self.data_root = get_path_from_url(DATASET_URL, DATASET_HOME, + DATASET_MD5) + + self.batch_size = 32 + self.num_threads = 4 + self.host_memory_padding = 0 + self.device_memory_padding = 0 + + +class TestImageReaderDecodeRandomCropNCHW(unittest.TestCase): + def setup(self): + self.data_root = get_path_from_url(DATASET_URL, DATASET_HOME, + DATASET_MD5) + + self.batch_size = 16 + self.num_threads = 2 + self.host_memory_padding = 1000000 + self.device_memory_padding = 1000000 + + self.aspect_ratio_min = 3. / 4. + self.aspect_ratio_max = 4. / 3. + self.area_min = 0.08 + self.area_max = 1.0 + self.num_attempts = 10 + + self.data_format = "NCHW" + self.channel_dim = 0 + + def test_static_output(self): + paddle.enable_static() + self.setup() + + indices = paddle.arange(self.batch_size) + image, label = file_label_loader(self.data_root, indices) + image = image_decode_random_crop(image, + num_threads=self.num_threads, + aspect_ratio_min=self.aspect_ratio_min, + aspect_ratio_max=self.aspect_ratio_max, + area_min=self.area_min, + area_max=self.area_max, + num_attempts=self.num_attempts, + data_format=self.data_format) + exe = paddle.static.Executor(paddle.CUDAPlace(0)) + out_image, out_label = exe.run(paddle.static.default_main_program(), + fetch_list=[image, label]) + + assert len(out_image) == self.batch_size + for i in range(self.batch_size): + img = np.array(out_image[i]) + assert img.dtype == np.uint8 + assert img.shape[self.channel_dim] == 3 + assert np.all(img >= 0) + assert np.all(img <= 255) + + assert len(out_label) == self.batch_size + assert label.dtype == paddle.int64 + label = np.array(out_label) + assert np.all(label >= 0) + assert np.all(label <= 1) + + +class 
TestImageReaderDecodeRandomCropNHWC(TestImageReaderDecodeRandomCropNCHW): + def setup(self): + self.data_root = get_path_from_url(DATASET_URL, DATASET_HOME, + DATASET_MD5) + + self.batch_size = 16 + self.num_threads = 4 + self.host_memory_padding = 0 + self.device_memory_padding = 0 + + self.aspect_ratio_min = 4. / 5. + self.aspect_ratio_max = 5. / 4. + self.area_min = 0.1 + self.area_max = 0.9 + self.num_attempts = 20 + + self.data_format = "NHWC" + self.channel_dim = 2 + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index 1fd6b75e712959..7b6fd81a57869f 100644 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -868,7 +868,7 @@ def read_file(filename, name=None): return out -def image_decode(x, mode='unchanged', num_threads=2, +def image_decode(x, num_threads=2, host_memory_padding=0, device_memory_padding=0, name=None): """ @@ -912,14 +912,13 @@ def image_decode(x, mode='unchanged', num_threads=2, core.VarDesc.VarType.LOD_TENSOR_ARRAY, False) program_id = utils._hash_with_id(mode, num_threads, name, local_rank) return _C_ops.batch_decode( - x, out, "mode", mode, "num_threads", num_threads, + x, out, "num_threads", num_threads, "local_rank", local_rank, "program_id", program_id, "host_memory_padding", host_memory_padding, "device_memory_padding", device_memory_padding) inputs = {'X': x} - attrs = {"mode": mode, - "num_threads": num_threads, + attrs = {"num_threads": num_threads, "local_rank": local_rank, "program_id": utils._hash_with_id(default_main_program()), "host_memory_padding": host_memory_padding, @@ -937,11 +936,10 @@ def image_decode(x, mode='unchanged', num_threads=2, def image_decode_random_crop(x, - mode='unchanged', num_threads=2, host_memory_padding=0, device_memory_padding=0, - data_layout='NCHW', + data_format='NCHW', aspect_ratio_min=3./4., aspect_ratio_max=4./3., area_min=0.08, @@ -956,8 +954,6 @@ def image_decode_random_crop(x, Args: x (Tensor): A one 
dimensional uint8 tensor containing the raw bytes of the JPEG image. - mode (str): The read mode used for optionally converting the image. - Default: 'unchanged'. num_threads (int): parallel thread number. aspect_ratio_min (float): aspect_ratio_max (float): @@ -992,8 +988,8 @@ def image_decode_random_crop(x, core.VarDesc.VarType.LOD_TENSOR_ARRAY, False) program_id = utils._hash_with_id(mode, num_threads, name, local_rank) return _C_ops.batch_decode_random_crop( - x, out, "mode", mode, "num_threads", num_threads, - "data_layout", data_layout, "aspect_ratio_min", + x, out, "num_threads", num_threads, + "data_format", data_format, "aspect_ratio_min", aspect_ratio_min, "aspect_ratio_max", aspect_ratio_max, "area_min", area_min, "area_max", area_max, "num_attempts", num_attempts, "local_rank", local_rank, @@ -1002,11 +998,10 @@ def image_decode_random_crop(x, "device_memory_padding", device_memory_padding) inputs = {'X': x} - attrs = {"mode": mode, - "num_threads": num_threads, + attrs = {"num_threads": num_threads, "host_memory_padding": host_memory_padding, "device_memory_padding": device_memory_padding, - "data_layout": data_layout, + "data_format": data_format, "aspect_ratio_min": aspect_ratio_min, "aspect_ratio_max": aspect_ratio_max, "area_min": area_min, diff --git a/python/paddle/vision/reader.py b/python/paddle/vision/reader.py index 061ccb9049a21d..4009b787890413 100644 --- a/python/paddle/vision/reader.py +++ b/python/paddle/vision/reader.py @@ -92,10 +92,6 @@ def file_label_loader(data_root, indices, name=None): need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. 
""" - # from paddle.vision.datasets import DatasetFolder - # data_folder = DatasetFolder(data_root) - # samples = [s[0] for s in data_folder.samples] - # targets = [s[1] for s in data_folder.samples] if in_dygraph_mode(): image = core.VarBase(core.VarDesc.VarType.UINT8, [], From e1bf5f1f258487f66e8a1c817d86be4e82bce499 Mon Sep 17 00:00:00 2001 From: LielinJiang Date: Fri, 1 Apr 2022 11:38:54 +0000 Subject: [PATCH 82/95] lod_tensor_array to list[lod_tensor] --- .../data/batch_decode_random_crop_op.cc | 95 +++++---- .../data/batch_decode_random_crop_op.cu | 82 ++++---- .../fluid/operators/data/batch_resize_op.cc | 34 ++-- .../fluid/operators/data/batch_resize_op.cu | 81 ++++---- .../operators/data/file_label_loader_op.cc | 12 +- .../operators/data/file_label_loader_op.h | 60 +++--- paddle/fluid/operators/data/image_decoder.cc | 191 +++++++++++------- python/paddle/fluid/dataloader/ops.py | 87 +++++--- python/paddle/vision/ops.py | 134 ++++++------ python/paddle/vision/reader.py | 78 +++---- 10 files changed, 467 insertions(+), 387 deletions(-) diff --git a/paddle/fluid/operators/data/batch_decode_random_crop_op.cc b/paddle/fluid/operators/data/batch_decode_random_crop_op.cc index 1a211332d2a30a..735c4286ab8e2a 100644 --- a/paddle/fluid/operators/data/batch_decode_random_crop_op.cc +++ b/paddle/fluid/operators/data/batch_decode_random_crop_op.cc @@ -23,21 +23,29 @@ class BatchDecodeRandomCropOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "DecodeJpeg"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "DecodeJpeg"); - + PADDLE_ENFORCE_GE(ctx->Inputs("X").size(), 1UL, + platform::errors::InvalidArgument( + "Inputs(X) of DecodeJpeg should not be empty.")); + PADDLE_ENFORCE_GE(ctx->Outputs("Out").size(), 1UL, + platform::errors::InvalidArgument( + "Outputs(Out) of DecodeJpeg should not be 
empty.")); auto aspect_ratio_min = ctx->Attrs().Get("aspect_ratio_min"); auto aspect_ratio_max = ctx->Attrs().Get("aspect_ratio_max"); - PADDLE_ENFORCE_GT(aspect_ratio_min, 0., - platform::errors::InvalidArgument( + PADDLE_ENFORCE_GT( + aspect_ratio_min, 0., + platform::errors::InvalidArgument( "aspect_ratio_min should be greater than 0, but received " - "%f", aspect_ratio_min)); - PADDLE_ENFORCE_GT(aspect_ratio_max, 0., - platform::errors::InvalidArgument( + "%f", + aspect_ratio_min)); + PADDLE_ENFORCE_GT( + aspect_ratio_max, 0., + platform::errors::InvalidArgument( "aspect_ratio_max should be greater than 0, but received " - "%f", aspect_ratio_max)); - PADDLE_ENFORCE_GE(aspect_ratio_max, aspect_ratio_min, - platform::errors::InvalidArgument( + "%f", + aspect_ratio_max)); + PADDLE_ENFORCE_GE( + aspect_ratio_max, aspect_ratio_min, + platform::errors::InvalidArgument( "aspect_ratio_max should be greater than aspect_ratio_min, " "but received aspect_ratio_max(%d) < aspect_ratio_min(%d)", aspect_ratio_max, aspect_ratio_min)); @@ -45,31 +53,34 @@ class BatchDecodeRandomCropOp : public framework::OperatorWithKernel { auto area_min = ctx->Attrs().Get("area_min"); auto area_max = ctx->Attrs().Get("area_max"); PADDLE_ENFORCE_GT(area_min, 0., - platform::errors::InvalidArgument( - "area_minshould be greater than 0, but received " - "%f", area_min)); + platform::errors::InvalidArgument( + "area_minshould be greater than 0, but received " + "%f", + area_min)); PADDLE_ENFORCE_GT(area_max, 0., - platform::errors::InvalidArgument( - "area_max should be greater than 0, but received " - "%f", area_max)); + platform::errors::InvalidArgument( + "area_max should be greater than 0, but received " + "%f", + area_max)); PADDLE_ENFORCE_GE(area_max, area_min, - platform::errors::InvalidArgument( - "area_max should be greater than area_min, " - "but received area_max(%f) < area_min(%f)", - area_max, area_min)); + platform::errors::InvalidArgument( + "area_max should be greater than 
area_min, " + "but received area_max(%f) < area_min(%f)", + area_max, area_min)); - auto num_attempts= ctx->Attrs().Get("num_attempts"); + auto num_attempts = ctx->Attrs().Get("num_attempts"); PADDLE_ENFORCE_GT(num_attempts, 0, - platform::errors::InvalidArgument( - "num_attempts should be a positive integerm, but " - "received %d", num_attempts)); + platform::errors::InvalidArgument( + "num_attempts should be a positive integerm, but " + "received %d", + num_attempts)); } protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::proto::VarType::UINT8, ctx.GetPlace()); + return framework::OpKernelType(framework::proto::VarType::UINT8, + ctx.GetPlace()); } framework::OpKernelType GetKernelTypeForVar( @@ -97,8 +108,9 @@ class BatchDecodeRandomCropOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddInput("X", "A one dimensional uint8 tensor containing the raw bytes " - "of the JPEG image. It is a tensor with rank 1."); - AddOutput("Out", "The output tensor of DecodeJpeg op"); + "of the JPEG image. It is a tensor with rank 1.") + .AsDuplicable(); + AddOutput("Out", "The output tensor of DecodeJpeg op").AsDuplicable(); AddComment(R"DOC( This operator decodes a JPEG image into a 3 dimensional RGB Tensor or 1 dimensional Gray Tensor. Optionally converts the image to the @@ -108,21 +120,20 @@ and 255. 
AddAttr("local_rank", "(int64_t)" "The index of the op to start execution"); - AddAttr("num_threads", "Path of the file to be readed.") - .SetDefault(2); + AddAttr("num_threads", "Path of the file to be readed.").SetDefault(2); AddAttr( "mode", "(string, default \"unchanged\"), The read mode used " "for optionally converting the image, can be \"unchanged\" " ",\"gray\" , \"rgb\" .") .SetDefault("unchanged"); - AddAttr( - "host_memory_padding", - "(int64, default 0), pinned memory allocation padding number for Nvjpeg decoding") + AddAttr("host_memory_padding", + "(int64, default 0), pinned memory allocation padding " + "number for Nvjpeg decoding") .SetDefault(0); - AddAttr( - "device_memory_padding", - "(int64, default 0), device memory allocation padding number for Nvjpeg decoding") + AddAttr("device_memory_padding", + "(int64, default 0), device memory allocation padding " + "number for Nvjpeg decoding") .SetDefault(0); AddAttr( "data_layout", @@ -131,8 +142,8 @@ and 255. "Specify that the data format of the input and output data is " "channel_first or channel_last.") .SetDefault("NCHW"); - AddAttr("aspect_ratio_min", "").SetDefault(3./4.); - AddAttr("aspect_ratio_max", "").SetDefault(4./3.); + AddAttr("aspect_ratio_min", "").SetDefault(3. / 4.); + AddAttr("aspect_ratio_max", "").SetDefault(4. / 3.); AddAttr("area_min", "").SetDefault(0.08); AddAttr("area_max", "").SetDefault(1.); AddAttr("num_attempts", "").SetDefault(10); @@ -150,8 +161,10 @@ and 255. 
namespace ops = paddle::operators; REGISTER_OPERATOR( - batch_decode_random_crop, ops::data::BatchDecodeRandomCropOp, ops::data::BatchDecodeRandomCropOpMaker, + batch_decode_random_crop, ops::data::BatchDecodeRandomCropOp, + ops::data::BatchDecodeRandomCropOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker) -REGISTER_OP_CPU_KERNEL(batch_decode_random_crop, ops::data::CPUBatchDecodeRandomCropKernel) +REGISTER_OP_CPU_KERNEL(batch_decode_random_crop, + ops::data::CPUBatchDecodeRandomCropKernel) diff --git a/paddle/fluid/operators/data/batch_decode_random_crop_op.cu b/paddle/fluid/operators/data/batch_decode_random_crop_op.cu index b2dc8a8238be8c..638ada7256cee4 100644 --- a/paddle/fluid/operators/data/batch_decode_random_crop_op.cu +++ b/paddle/fluid/operators/data/batch_decode_random_crop_op.cu @@ -25,7 +25,8 @@ namespace paddle { namespace operators { namespace data { -using LoDTensorBlockingQueueHolder = operators::reader::LoDTensorBlockingQueueHolder; +using LoDTensorBlockingQueueHolder = + operators::reader::LoDTensorBlockingQueueHolder; using DataLayout = framework::DataLayout; ImageDecoderThreadPool* decode_pool = nullptr; @@ -42,21 +43,17 @@ class GPUBatchDecodeRandomCropKernel : public framework::OpKernel { auto device_memory_padding = ctx.Attr("device_memory_padding"); // multi-phrase decode thread pool - auto* decode_pool = - ImageDecoderThreadPoolManager::Instance()->GetDecoderThreadPool( - program_id, num_threads, mode, local_rank, - static_cast(host_memory_padding), - static_cast(device_memory_padding)); + auto* decode_pool = + ImageDecoderThreadPoolManager::Instance()->GetDecoderThreadPool( + program_id, num_threads, mode, local_rank, + static_cast(host_memory_padding), + static_cast(device_memory_padding)); - const framework::LoDTensorArray* inputs = - ctx.Input("X"); - int batch_size = inputs->size(); + auto inputs = ctx.MultiInput("X"); + int batch_size = inputs.size(); - auto* out = ctx.OutputVar("Out"); + auto 
out_array = ctx.MultiOutput("Out"); auto dev = platform::CUDAPlace(local_rank); - - auto& out_array = *out->GetMutable(); - out_array.resize(batch_size); const std::string data_layout_str = ctx.Attr("data_layout"); const DataLayout data_layout = @@ -76,52 +73,46 @@ class GPUBatchDecodeRandomCropKernel : public framework::OpKernel { AreaRange area_range{area_min, area_max}; auto* generators = GeneratorManager::Instance()->GetGenerators( - program_id, batch_size, aspect_ratio_range, - area_range); - - for (size_t i = 0; i < inputs->size(); i++) { - const framework::LoDTensor x = inputs->at(i); - auto* x_data = x.data(); - size_t x_numel = static_cast(x.numel()); - - if (data_layout == DataLayout::kNCHW){ - ImageDecodeTask task = { - .bit_stream = x_data, - .bit_len = x_numel, - .tensor = &temp_array[i], - .roi_generator = generators->at(i).get(), - .place = dev - }; + program_id, batch_size, aspect_ratio_range, area_range); + + for (size_t i = 0; i < inputs.size(); i++) { + const framework::LoDTensor* x = inputs.at(i); + auto* x_data = x->data(); + size_t x_numel = static_cast(x->numel()); + + if (data_layout == DataLayout::kNCHW) { + ImageDecodeTask task = {.bit_stream = x_data, + .bit_len = x_numel, + .tensor = &temp_array[i], + .roi_generator = generators->at(i).get(), + .place = dev}; decode_pool->AddTask(std::make_shared(task)); - } - else{ - ImageDecodeTask task = { - .bit_stream = x_data, - .bit_len = x_numel, - .tensor = &out_array[i], - .roi_generator = generators->at(i).get(), - .place = dev - }; + } else { + ImageDecodeTask task = {.bit_stream = x_data, + .bit_len = x_numel, + .tensor = out_array[i], + .roi_generator = generators->at(i).get(), + .place = dev}; decode_pool->AddTask(std::make_shared(task)); } - } decode_pool->RunAll(true); - if (data_layout == DataLayout::kNCHW){ + if (data_layout == DataLayout::kNCHW) { const auto& dev_ctx = ctx.cuda_device_context(); phi::funcs::Transpose trans; std::vector axis = {2, 0, 1}; - for (size_t i = 0; i < 
inputs->size(); i++) { + for (size_t i = 0; i < inputs.size(); i++) { // Do transpose const framework::DDim& in_sizes = temp_array[i].dims(); framework::DDim transposed_input_shape = in_sizes.transpose(axis); std::vector transposed_input_shape_ = phi::vectorize(transposed_input_shape); - out_array[i].Resize(transposed_input_shape); - out_array[i].mutable_data(dev_ctx.GetPlace()); - trans(dev_ctx, temp_array[i], &out_array[i], axis); + + out_array[i]->Resize(transposed_input_shape); + out_array[i]->mutable_data(dev_ctx.GetPlace()); + trans(dev_ctx, temp_array[i], out_array[i], axis); } } } @@ -132,6 +123,7 @@ class GPUBatchDecodeRandomCropKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(batch_decode_random_crop, ops::data::GPUBatchDecodeRandomCropKernel) +REGISTER_OP_CUDA_KERNEL(batch_decode_random_crop, + ops::data::GPUBatchDecodeRandomCropKernel) #endif diff --git a/paddle/fluid/operators/data/batch_resize_op.cc b/paddle/fluid/operators/data/batch_resize_op.cc index d3fbbfd17f58ad..e46f12cb6b23ed 100644 --- a/paddle/fluid/operators/data/batch_resize_op.cc +++ b/paddle/fluid/operators/data/batch_resize_op.cc @@ -24,28 +24,27 @@ class BatchResizeOp : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "BatchResize"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", - "BatchResize"); + PADDLE_ENFORCE_GE(ctx->Inputs("X").size(), 1UL, + platform::errors::InvalidArgument( + "Inputs(X) of BatchResize should not be empty.")); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "BatchResize"); auto size = ctx->Attrs().Get>("size"); PADDLE_ENFORCE_EQ(size.size(), 2, platform::errors::InvalidArgument( "The length of Attrs(size) should be 2.")); - PADDLE_ENFORCE_GT(size[0], 0, - platform::errors::InvalidArgument( - "h in Attr(size) of Op(BatchResize) " - "should be greater than 
0.")); - PADDLE_ENFORCE_GT(size[1], 0, - platform::errors::InvalidArgument( - "w in Attr(size) of Op(BatchResize) " - "should be greater than 0.")); + PADDLE_ENFORCE_GT(size[0], 0, platform::errors::InvalidArgument( + "h in Attr(size) of Op(BatchResize) " + "should be greater than 0.")); + PADDLE_ENFORCE_GT(size[1], 0, platform::errors::InvalidArgument( + "w in Attr(size) of Op(BatchResize) " + "should be greater than 0.")); } framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::proto::VarType::UINT8, ctx.GetPlace()); + return framework::OpKernelType(framework::proto::VarType::UINT8, + ctx.GetPlace()); } framework::OpKernelType GetKernelTypeForVar( @@ -62,7 +61,8 @@ class BatchResizeOp : public framework::OperatorWithKernel { class BatchResizeOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddInput("X", "(LoDTensorArray). A batch of instances to random crop."); + AddInput("X", "(List[LoDTensor]). A batch of instances to random crop.") + .AsDuplicable(); AddOutput("Out", "(Tensor). 
The cropped instance batch."); AddAttr>( "size", "expected output size of the crop, for each edge."); @@ -103,10 +103,8 @@ class BatchResizeOpMaker : public framework::OpProtoAndCheckerMaker { namespace ops = paddle::operators; REGISTER_OPERATOR( - batch_resize, ops::data::BatchResizeOp, - ops::data::BatchResizeOpMaker, + batch_resize, ops::data::BatchResizeOp, ops::data::BatchResizeOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL(batch_resize, - ops::data::BatchResizeCPUKernel) +REGISTER_OP_CPU_KERNEL(batch_resize, ops::data::BatchResizeCPUKernel) diff --git a/paddle/fluid/operators/data/batch_resize_op.cu b/paddle/fluid/operators/data/batch_resize_op.cu index 4953e39801d3de..e1164043b8c117 100644 --- a/paddle/fluid/operators/data/batch_resize_op.cu +++ b/paddle/fluid/operators/data/batch_resize_op.cu @@ -76,10 +76,10 @@ __global__ void KeNearestNeighborInterpFw( template __global__ void KeBilinearInterpFw( const T* in, const size_t in_img_h, const size_t in_img_w, - const size_t input_h, const size_t input_w, T* out, - const size_t out_img_h, const size_t out_img_w, const size_t output_h, - const size_t output_w, const size_t num_channels, const float ratio_h, - const float ratio_w, const bool align_corners, const int align_mode, + const size_t input_h, const size_t input_w, T* out, const size_t out_img_h, + const size_t out_img_w, const size_t output_h, const size_t output_w, + const size_t num_channels, const float ratio_h, const float ratio_w, + const bool align_corners, const int align_mode, const DataLayout data_format) { int nthreads = output_h * output_w; int tid = blockIdx.x * blockDim.x + threadIdx.x; @@ -114,8 +114,8 @@ __global__ void KeBilinearInterpFw( int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0; float src_h = ratio_h * (out_img_idy + 0.5) - 0.5; src_h = src_h > 0 ? src_h : 0; - float h1lambda = align_flag ? 
src_h - in_img_idy - : ratio_h * out_img_idy - in_img_idy; + float h1lambda = + align_flag ? src_h - in_img_idy : ratio_h * out_img_idy - in_img_idy; float h2lambda = 1.f - h1lambda; // get input w index with offset @@ -126,8 +126,8 @@ __global__ void KeBilinearInterpFw( int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0; float src_w = ratio_w * (out_img_idx + 0.5) - 0.5; src_w = src_w > 0 ? src_w : 0; - float w1lambda = align_flag ? src_w - in_img_idx - : ratio_w * out_img_idx - in_img_idx; + float w1lambda = + align_flag ? src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx; float w2lambda = 1.f - w1lambda; if (data_format == DataLayout::kNCHW) { @@ -135,33 +135,34 @@ __global__ void KeBilinearInterpFw( in_img_idy * in_img_w + in_img_idx]; // bilinear interpolation - out[out_id_h * output_w + out_id_w] = (T)( - h2lambda * (w2lambda * in_pos[0] + w1lambda * in_pos[w_id]) + - h1lambda * (w2lambda * in_pos[h_id * in_img_w] + - w1lambda * in_pos[h_id * in_img_w + w_id])); + out[out_id_h * output_w + out_id_w] = + (T)(h2lambda * (w2lambda * in_pos[0] + w1lambda * in_pos[w_id]) + + h1lambda * (w2lambda * in_pos[h_id * in_img_w] + + w1lambda * in_pos[h_id * in_img_w + w_id])); } else { const T* in_pos = &in[out_id_h * input_w + in_img_idy * in_img_w * num_channels + in_img_idx * num_channels + channel_id]; // bilinear interpolation - out[out_id_h * output_w + out_id_w] = (T)( - h2lambda * - (w2lambda * in_pos[0] + w1lambda * in_pos[w_id * num_channels]) + - h1lambda * (w2lambda * in_pos[h_id * in_img_w * num_channels] + - w1lambda * in_pos[h_id * in_img_w * num_channels + - w_id * num_channels])); + out[out_id_h * output_w + out_id_w] = + (T)(h2lambda * (w2lambda * in_pos[0] + + w1lambda * in_pos[w_id * num_channels]) + + h1lambda * (w2lambda * in_pos[h_id * in_img_w * num_channels] + + w1lambda * in_pos[h_id * in_img_w * num_channels + + w_id * num_channels])); } } } template -static void ResizeFwd( - const framework::ExecutionContext& ctx, const framework::LoDTensor& 
input, - framework::Tensor* output, const std::vector out_size, - const std::string interp_method, const bool align_corners, - const int align_mode, const int img_h, const int img_w, const int c, - const DataLayout data_format) { +static void ResizeFwd(const framework::ExecutionContext& ctx, + const framework::LoDTensor& input, + framework::Tensor* output, + const std::vector out_size, + const std::string interp_method, const bool align_corners, + const int align_mode, const int img_h, const int img_w, + const int c, const DataLayout data_format) { auto input_data = input.template data(); int out_h = static_cast(out_size[0]); int out_w = static_cast(out_size[1]); @@ -201,8 +202,7 @@ static void ResizeFwd( KeBilinearInterpFw<<>>( input_data, img_h, img_w, 1, in_chw, output_data, out_h, out_w, 1, - out_chw, c, ratio_h, ratio_w, align_corners, align_mode, - data_format); + out_chw, c, ratio_h, ratio_w, align_corners, align_mode, data_format); } } @@ -214,8 +214,8 @@ class BatchResizeCUDAKernel : public framework::OpKernel { platform::is_gpu_place(ctx.GetPlace()), true, platform::errors::NotFound("This kernel only runs on GPU device.")); // get input, output - auto* x = ctx.Input("X"); - PADDLE_ENFORCE_GT(x->size(), 0, + auto x = ctx.MultiInput("X"); + PADDLE_ENFORCE_GT(x.size(), 0, platform::errors::InvalidArgument( "The size of X must be greater than 0.")); auto* out = ctx.Output("Out"); @@ -231,30 +231,28 @@ class BatchResizeCUDAKernel : public framework::OpKernel { bool align_corners = ctx.Attr("align_corners"); int align_mode = ctx.Attr("align_mode"); - auto* img = &x->at(0); - int64_t img_c = data_format == DataLayout::kNCHW ? \ - img->dims()[0] : img->dims()[2]; + auto* img = x.at(0); + int64_t img_c = + data_format == DataLayout::kNCHW ? 
img->dims()[0] : img->dims()[2]; - std::vector out_dim = {static_cast(x->size()), - size[0], size[1], img_c}; + std::vector out_dim = {static_cast(x.size()), size[0], + size[1], img_c}; if (data_format == DataLayout::kNCHW) { - out_dim = {static_cast(x->size()), - img_c, size[0], size[1]}; + out_dim = {static_cast(x.size()), img_c, size[0], size[1]}; } out->Resize(phi::make_ddim(out_dim)); out->mutable_data(ctx.GetPlace()); int img_h, img_w, idx_h, idx_w, crop_h, crop_w; - for (int i = 0; i < x->size(); i++) { - img = &x->at(i); + for (int i = 0; i < x.size(); i++) { + img = x.at(i); img_h = data_format == DataLayout::kNCHW ? img->dims()[1] : img->dims()[0]; img_w = data_format == DataLayout::kNCHW ? img->dims()[2] : img->dims()[1]; auto out_tensor = out->Slice(i, i + 1); - ResizeFwd(ctx, *img, &out_tensor, size, interp_method, - align_corners, align_mode, img_h, img_w, img_c, - data_format); + ResizeFwd(ctx, *img, &out_tensor, size, interp_method, align_corners, + align_mode, img_h, img_w, img_c, data_format); } } }; @@ -264,6 +262,5 @@ class BatchResizeCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(batch_resize, - ops::data::BatchResizeCUDAKernel, +REGISTER_OP_CUDA_KERNEL(batch_resize, ops::data::BatchResizeCUDAKernel, ops::data::BatchResizeCUDAKernel); diff --git a/paddle/fluid/operators/data/file_label_loader_op.cc b/paddle/fluid/operators/data/file_label_loader_op.cc index 3b26438db00d7f..530d51ec35d358 100644 --- a/paddle/fluid/operators/data/file_label_loader_op.cc +++ b/paddle/fluid/operators/data/file_label_loader_op.cc @@ -27,9 +27,9 @@ class FileLabelLoaderOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(ctx->HasInput("Indices"), true, platform::errors::InvalidArgument( "Input(Indices) of ReadFileLoaderOp is null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Image"), true, - platform::errors::InvalidArgument( - "Output(Image) of ReadFileLoaderOp is null.")); + // 
PADDLE_ENFORCE_EQ(ctx->HasOutput("Image"), true, + // platform::errors::InvalidArgument( + // "Output(Image) of ReadFileLoaderOp is null.")); PADDLE_ENFORCE_EQ(ctx->HasOutput("Label"), true, platform::errors::InvalidArgument( "Output(Label) of ReadFileLoaderOp is null.")); @@ -51,7 +51,8 @@ class FileLabelLoaderOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("Indices", "The batch indices of input samples"); - AddOutput("Image", "The output image tensor of ReadFileLoader op"); + AddOutput("Image", "The output image tensor of ReadFileLoader op") + .AsDuplicable(); AddOutput("Label", "The output label tensor of ReadFileLoader op"); AddAttr("data_root", "Path of root directory of dataset"); AddComment(R"DOC( @@ -71,4 +72,5 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker) -REGISTER_OP_CPU_KERNEL(file_label_loader, ops::FileLabelLoaderCPUKernel) +REGISTER_OP_CPU_KERNEL(file_label_loader, + ops::FileLabelLoaderCPUKernel) diff --git a/paddle/fluid/operators/data/file_label_loader_op.h b/paddle/fluid/operators/data/file_label_loader_op.h index 7e6b0a555acafe..995c410b7966ed 100644 --- a/paddle/fluid/operators/data/file_label_loader_op.h +++ b/paddle/fluid/operators/data/file_label_loader_op.h @@ -13,19 +13,19 @@ // limitations under the License. 
#pragma once +#include +#include +#include #include #include -#include #include -#include -#include #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" +#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace operators { @@ -39,8 +39,7 @@ constexpr char DIR_SEP = '\\'; constexpr char DIR_SEP = '/'; #endif -static std::string JoinPath(const std::string path1, - const std::string path2) { +static std::string JoinPath(const std::string path1, const std::string path2) { // empty check if (path1.empty()) return path2; if (path1.empty()) return path1; @@ -56,11 +55,12 @@ static std::string JoinPath(const std::string path1, return path1 + DIR_SEP + path2; } -static void ParseFilesAndLabels(const std::string data_root, - std::vector>* samples) { +static void ParseFilesAndLabels( + const std::string data_root, + std::vector>* samples) { auto* dir = opendir(data_root.c_str()); PADDLE_ENFORCE_NE(dir, nullptr, platform::errors::InvalidArgument( - "Cannot open directory %s", data_root)); + "Cannot open directory %s", data_root)); // Step 1: parse classes info std::vector classes; @@ -69,13 +69,13 @@ static void ParseFilesAndLabels(const std::string data_root, if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0) { entry = readdir(dir); continue; - } + } auto cls_path = JoinPath(data_root, entry->d_name); struct stat s; int ret = stat(cls_path.c_str(), &s); PADDLE_ENFORCE_EQ(ret, 0, platform::errors::InvalidArgument( - "Directory %s is unaccessiable.", cls_path)); + "Directory %s is unaccessiable.", cls_path)); if (S_ISDIR(s.st_mode)) classes.emplace_back(entry->d_name); @@ -89,13 +89,12 @@ static void ParseFilesAndLabels(const std::string data_root, // Step 2: traverse directory to generate 
samples for (int class_id = 0; class_id < static_cast(classes.size()); - class_id++) { - auto cur_dir = data_root + DIR_SEP + classes[class_id]; + class_id++) { + auto cur_dir = data_root + DIR_SEP + classes[class_id]; dir = opendir(cur_dir.c_str()); entry = readdir(dir); while (entry) { - if (strcmp(entry->d_name, ".") == 0 - || strcmp(entry->d_name, "..") == 0) { + if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0) { entry = readdir(dir); continue; } @@ -107,12 +106,13 @@ static void ParseFilesAndLabels(const std::string data_root, } closedir(dir); } - } -std::map>> root_to_samples_; +std::map>> + root_to_samples_; -static std::vector>* GetFilesAndLabelsFromCache(const std::string data_root) { +static std::vector>* GetFilesAndLabelsFromCache( + const std::string data_root) { auto iter = root_to_samples_.find(data_root); if (iter == root_to_samples_.end()) { std::vector> samples; @@ -120,16 +120,16 @@ static std::vector>* GetFilesAndLabelsFromCache(cons VLOG(4) << "Init sample number: " << samples.size(); root_to_samples_[data_root] = samples; } - + return &(root_to_samples_[data_root]); } template -class FileLabelLoaderCPUKernel: public framework::OpKernel { +class FileLabelLoaderCPUKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override { + void Compute(const framework::ExecutionContext& ctx) const override { auto* indices = ctx.Input("Indices"); - auto* image_arr = ctx.Output("Image"); + auto image_arr = ctx.MultiOutput("Image"); auto* label_tensor = ctx.Output("Label"); auto data_root = ctx.Attr("data_root"); @@ -138,11 +138,9 @@ class FileLabelLoaderCPUKernel: public framework::OpKernel { auto batch_size = indices->dims()[0]; const int64_t* indices_data = indices->data(); - image_arr->clear(); - image_arr->reserve(batch_size); - label_tensor->Resize( - phi::make_ddim({static_cast(batch_size)})); - auto* label_data = label_tensor->mutable_data(platform::CPUPlace()); + 
label_tensor->Resize(phi::make_ddim({static_cast(batch_size)})); + auto* label_data = + label_tensor->mutable_data(platform::CPUPlace()); for (int64_t i = 0; i < batch_size; i++) { int64_t index = static_cast(indices_data[i]); auto file = samples->at(index).first; @@ -153,15 +151,14 @@ class FileLabelLoaderCPUKernel: public framework::OpKernel { input.seekg(0, std::ios::beg); - framework::LoDTensor image; + auto image = image_arr[i]; std::vector image_len = {file_size}; - image.Resize(phi::make_ddim(image_len)); + image->Resize(phi::make_ddim(image_len)); - uint8_t* data = image.mutable_data(platform::CPUPlace()); + uint8_t* data = image->mutable_data(platform::CPUPlace()); input.read(reinterpret_cast(data), file_size); - image_arr->emplace_back(image); label_data[i] = static_cast(label); } } @@ -174,7 +171,6 @@ class FileLabelLoaderCPUKernel: public framework::OpKernel { framework::TensorCopy(lod_tensor, lod_tensor.place(), &out_tensor); out_tensor.set_lod(lod_tensor.lod()); } - }; } // namespace data diff --git a/paddle/fluid/operators/data/image_decoder.cc b/paddle/fluid/operators/data/image_decoder.cc index afac0aad3892ed..0d59ffb5118ea9 100644 --- a/paddle/fluid/operators/data/image_decoder.cc +++ b/paddle/fluid/operators/data/image_decoder.cc @@ -20,73 +20,88 @@ namespace data { ImageDecoder::ImageDecoder(std::string mode, int dev_id, size_t host_memory_padding, - size_t device_memory_padding) - : nvjpeg_streams_(2), - pinned_buffers_(2), - page_id_(0), - mode_(mode) { + size_t device_memory_padding) + : nvjpeg_streams_(2), pinned_buffers_(2), page_id_(0), mode_(mode) { platform::SetDeviceId(dev_id); // create nvjpeg handle and stream - PADDLE_ENFORCE_NVJPEG_SUCCESS( - platform::dynload::nvjpegCreateEx(NVJPEG_BACKEND_HYBRID, &device_allocator_, - &pinned_allocator_, 0, &handle_)); + PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegCreateEx( + NVJPEG_BACKEND_HYBRID, &device_allocator_, &pinned_allocator_, 0, + &handle_)); // set pinned/device memory 
padding if (host_memory_padding > 0) { - PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegSetPinnedMemoryPadding(host_memory_padding, handle_)); + PADDLE_ENFORCE_NVJPEG_SUCCESS( + platform::dynload::nvjpegSetPinnedMemoryPadding(host_memory_padding, + handle_)); } if (device_memory_padding > 0) { - PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegSetDeviceMemoryPadding(device_memory_padding, handle_)); + PADDLE_ENFORCE_NVJPEG_SUCCESS( + platform::dynload::nvjpegSetDeviceMemoryPadding(device_memory_padding, + handle_)); } // create nvjpeg stream for (size_t i = 0; i < nvjpeg_streams_.size(); i++) { - PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegJpegStreamCreate(handle_, &nvjpeg_streams_[i])); + PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegJpegStreamCreate( + handle_, &nvjpeg_streams_[i])); } // create decode params, decoder and state - PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecodeParamsCreate(handle_, &decode_params_)); - PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecoderCreate(handle_, NVJPEG_BACKEND_HYBRID, &decoder_)); - PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecoderStateCreate(handle_, decoder_, &state_)); + PADDLE_ENFORCE_NVJPEG_SUCCESS( + platform::dynload::nvjpegDecodeParamsCreate(handle_, &decode_params_)); + PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecoderCreate( + handle_, NVJPEG_BACKEND_HYBRID, &decoder_)); + PADDLE_ENFORCE_NVJPEG_SUCCESS( + platform::dynload::nvjpegDecoderStateCreate(handle_, decoder_, &state_)); // create device & pinned buffer - PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegBufferDeviceCreate(handle_, &device_allocator_, &device_buffer_)); + PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegBufferDeviceCreate( + handle_, &device_allocator_, &device_buffer_)); for (size_t i = 0; i < pinned_buffers_.size(); i++) { - PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegBufferPinnedCreate(handle_, &pinned_allocator_, 
&pinned_buffers_[i])); + PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegBufferPinnedCreate( + handle_, &pinned_allocator_, &pinned_buffers_[i])); } } ImageDecoder::~ImageDecoder() { // destroy nvjpeg streams for (size_t i = 0; i < nvjpeg_streams_.size(); i++) { - PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegJpegStreamDestroy(nvjpeg_streams_[i])); + PADDLE_ENFORCE_NVJPEG_SUCCESS( + platform::dynload::nvjpegJpegStreamDestroy(nvjpeg_streams_[i])); } // destroy decode params, decoder and state - PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecodeParamsDestroy(decode_params_)); - PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecoderDestroy(decoder_)); - PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegJpegStateDestroy(state_)); + PADDLE_ENFORCE_NVJPEG_SUCCESS( + platform::dynload::nvjpegDecodeParamsDestroy(decode_params_)); + PADDLE_ENFORCE_NVJPEG_SUCCESS( + platform::dynload::nvjpegDecoderDestroy(decoder_)); + PADDLE_ENFORCE_NVJPEG_SUCCESS( + platform::dynload::nvjpegJpegStateDestroy(state_)); - PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegBufferDeviceDestroy(device_buffer_)); + PADDLE_ENFORCE_NVJPEG_SUCCESS( + platform::dynload::nvjpegBufferDeviceDestroy(device_buffer_)); for (size_t i = 0; i < pinned_buffers_.size(); i++) { - PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegBufferPinnedDestroy(pinned_buffers_[i])); + PADDLE_ENFORCE_NVJPEG_SUCCESS( + platform::dynload::nvjpegBufferPinnedDestroy(pinned_buffers_[i])); } // destroy nvjpeg handle at last PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDestroy(handle_)); } -void ImageDecoder::CPUDecodeRandomCrop( - const uint8_t* data, size_t length, - RandomROIGenerator* roi_generator, - unsigned char* workspace, size_t workspace_size, - framework::LoDTensor* out, platform::Place place) { +void ImageDecoder::CPUDecodeRandomCrop(const uint8_t* data, size_t length, + RandomROIGenerator* roi_generator, + unsigned char* workspace, + size_t workspace_size, + 
framework::LoDTensor* out, + platform::Place place) { VLOG(4) << "CPUDecodeRandomCropResize enter"; #ifdef PADDLE_WITH_OPENCV - cv::Mat image = - cv::imdecode(cv::Mat(1, length, CV_8UC1, const_cast(data)), cv::IMREAD_COLOR); - + cv::Mat image = cv::imdecode( + cv::Mat(1, length, CV_8UC1, const_cast(data)), + cv::IMREAD_COLOR); + cv::Mat cropped; int height = image.rows; int width = image.cols; @@ -112,7 +127,8 @@ void ImageDecoder::CPUDecodeRandomCrop( cpu_tensor.Resize(phi::make_ddim(out_shape)); auto* cpu_data = cpu_tensor.mutable_data(platform::CPUPlace()); - cv::Mat cpu_mat(height, width, CV_8UC3, const_cast(cpu_data), cv::Mat::AUTO_STEP); + cv::Mat cpu_mat(height, width, CV_8UC3, const_cast(cpu_data), + cv::Mat::AUTO_STEP); cv::cvtColor(cropped, cpu_mat, cv::COLOR_BGR2RGB); // copy cpu tensor to output gpu tensor @@ -132,9 +148,8 @@ nvjpegStatus_t ImageDecoder::ParseDecodeParams( int widths[NVJPEG_MAX_COMPONENT]; int heights[NVJPEG_MAX_COMPONENT]; - - nvjpegStatus_t status = platform::dynload::nvjpegGetImageInfo(handle_, bit_stream, bit_len, - &components, &subsampling, widths, heights); + nvjpegStatus_t status = platform::dynload::nvjpegGetImageInfo( + handle_, bit_stream, bit_len, &components, &subsampling, widths, heights); if (status != NVJPEG_STATUS_SUCCESS) return status; @@ -153,8 +168,8 @@ nvjpegStatus_t ImageDecoder::ParseDecodeParams( output_components = 3; } else { PADDLE_THROW(platform::errors::Fatal( - "The provided mode(%s) does not support components(%d)", - mode_, components)); + "The provided mode(%s) does not support components(%d)", mode_, + components)); } } else if (mode_ == "gray") { output_format = NVJPEG_OUTPUT_Y; @@ -164,16 +179,20 @@ nvjpegStatus_t ImageDecoder::ParseDecodeParams( output_components = 3; } else { PADDLE_THROW(platform::errors::Fatal( - "The provided mode is not supported for JPEG files on GPU: %s!", mode_)); + "The provided mode is not supported for JPEG files on GPU: %s!", + mode_)); } - 
PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecodeParamsSetOutputFormat(decode_params_, output_format)); + PADDLE_ENFORCE_NVJPEG_SUCCESS( + platform::dynload::nvjpegDecodeParamsSetOutputFormat(decode_params_, + output_format)); if (roi_generator) { ROI roi; roi_generator->GenerateRandomROI(width, height, &roi); - PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecodeParamsSetROI(decode_params_, roi.x, roi.y, roi.w, roi.h)); + PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecodeParamsSetROI( + decode_params_, roi.x, roi.y, roi.w, roi.h)); height = roi.h; width = roi.w; } @@ -189,56 +208,73 @@ nvjpegStatus_t ImageDecoder::ParseDecodeParams( return NVJPEG_STATUS_SUCCESS; } -nvjpegStatus_t ImageDecoder::GPUDecodeRandomCrop(const uint8_t* bit_stream, size_t bit_len, nvjpegImage_t* out_image) { +nvjpegStatus_t ImageDecoder::GPUDecodeRandomCrop(const uint8_t* bit_stream, + size_t bit_len, + nvjpegImage_t* out_image) { auto buffer = pinned_buffers_[page_id_]; auto stream = nvjpeg_streams_[page_id_]; page_id_ ^= 1; // decode jpeg in host to pinned buffer - PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegJpegStreamParse(handle_, bit_stream, bit_len, false, false, stream)); - PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegStateAttachPinnedBuffer(state_, buffer)); - nvjpegStatus_t status = platform::dynload::nvjpegDecodeJpegHost(handle_, decoder_, state_, decode_params_, stream); + PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegJpegStreamParse( + handle_, bit_stream, bit_len, false, false, stream)); + PADDLE_ENFORCE_NVJPEG_SUCCESS( + platform::dynload::nvjpegStateAttachPinnedBuffer(state_, buffer)); + nvjpegStatus_t status = platform::dynload::nvjpegDecodeJpegHost( + handle_, decoder_, state_, decode_params_, stream); if (status != NVJPEG_STATUS_SUCCESS) return status; // transfer and decode to device buffer - PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegStateAttachDeviceBuffer(state_, device_buffer_)); - 
PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecodeJpegTransferToDevice(handle_, decoder_, state_, stream, cuda_stream_)); - status = platform::dynload::nvjpegDecodeJpegDevice(handle_, decoder_, state_, out_image, nullptr); + PADDLE_ENFORCE_NVJPEG_SUCCESS( + platform::dynload::nvjpegStateAttachDeviceBuffer(state_, device_buffer_)); + PADDLE_ENFORCE_NVJPEG_SUCCESS( + platform::dynload::nvjpegDecodeJpegTransferToDevice( + handle_, decoder_, state_, stream, cuda_stream_)); + status = platform::dynload::nvjpegDecodeJpegDevice(handle_, decoder_, state_, + out_image, nullptr); return status; } -void ImageDecoder::Run( - const uint8_t* bit_stream, size_t bit_len, framework::LoDTensor* out, - RandomROIGenerator* roi_generator, platform::Place& place) { +void ImageDecoder::Run(const uint8_t* bit_stream, size_t bit_len, + framework::LoDTensor* out, + RandomROIGenerator* roi_generator, + platform::Place& place) { nvjpegImage_t image; - nvjpegStatus_t status = ParseDecodeParams(bit_stream, bit_len, out, roi_generator, &image, place); + + nvjpegStatus_t status = + ParseDecodeParams(bit_stream, bit_len, out, roi_generator, &image, place); if (status != NVJPEG_STATUS_SUCCESS) { - CPUDecodeRandomCrop(bit_stream, bit_len, roi_generator, nullptr, 0, out, place); + CPUDecodeRandomCrop(bit_stream, bit_len, roi_generator, nullptr, 0, out, + place); return; } + status = GPUDecodeRandomCrop(bit_stream, bit_len, &image); if (status != NVJPEG_STATUS_SUCCESS) { - CPUDecodeRandomCrop(bit_stream, bit_len, roi_generator, nullptr, 0, out, place); + CPUDecodeRandomCrop(bit_stream, bit_len, roi_generator, nullptr, 0, out, + place); } } ImageDecoderThreadPool::ImageDecoderThreadPool( const int num_threads, const std::string mode, const int dev_id, const size_t host_memory_padding, const size_t device_memory_padding) - : threads_(num_threads), - mode_(mode), - dev_id_(dev_id), - shutdown_(false), - running_(false), - completed_(false), - outstand_tasks_(0) { - 
PADDLE_ENFORCE_GT(num_threads, 0, platform::errors::InvalidArgument( - "num_threads shoule be a positive interger, " - "but got %d", num_threads)); + : threads_(num_threads), + mode_(mode), + dev_id_(dev_id), + shutdown_(false), + running_(false), + completed_(false), + outstand_tasks_(0) { + PADDLE_ENFORCE_GT(num_threads, 0, + platform::errors::InvalidArgument( + "num_threads shoule be a positive interger, " + "but got %d", + num_threads)); for (int i = 0; i < num_threads; i++) { threads_.emplace_back( - std::thread(std::bind(&ImageDecoderThreadPool::ThreadLoop, - this, i, host_memory_padding, device_memory_padding))); + std::thread(std::bind(&ImageDecoderThreadPool::ThreadLoop, this, i, + host_memory_padding, device_memory_padding))); } } @@ -278,29 +314,29 @@ void ImageDecoderThreadPool::ShutDown() { task_queue_.clear(); for (auto& thread : threads_) { - if (thread.joinable()) thread.join(); + if (thread.joinable()) thread.join(); } } void ImageDecoderThreadPool::SortTaskByLengthDescend() { std::lock_guard lock(mutex_); std::sort(task_queue_.begin(), task_queue_.end(), - [](const std::shared_ptr a, - const std::shared_ptr b) { - return b->bit_len < a->bit_len; - }); + [](const std::shared_ptr a, + const std::shared_ptr b) { + return b->bit_len < a->bit_len; + }); } -void ImageDecoderThreadPool::ThreadLoop( - const int thread_idx, const size_t host_memory_padding, - const size_t device_memory_padding) { - ImageDecoder* decoder = new ImageDecoder(mode_, dev_id_, - host_memory_padding, +void ImageDecoderThreadPool::ThreadLoop(const int thread_idx, + const size_t host_memory_padding, + const size_t device_memory_padding) { + ImageDecoder* decoder = new ImageDecoder(mode_, dev_id_, host_memory_padding, device_memory_padding); - while (!shutdown_) { std::unique_lock lock(mutex_); - running_cond_.wait(lock, [this] { return (running_ && !task_queue_.empty()) || shutdown_; }); + running_cond_.wait(lock, [this] { + return (running_ && !task_queue_.empty()) || shutdown_; + 
}); if (shutdown_) break; auto task = task_queue_.front(); @@ -322,7 +358,8 @@ void ImageDecoderThreadPool::ThreadLoop( } // initialization static variables out of ImageDecoderThreadPoolManager -ImageDecoderThreadPoolManager* ImageDecoderThreadPoolManager::pm_instance_ptr_ = nullptr; +ImageDecoderThreadPoolManager* ImageDecoderThreadPoolManager::pm_instance_ptr_ = + nullptr; std::mutex ImageDecoderThreadPoolManager::m_; } // namespace data diff --git a/python/paddle/fluid/dataloader/ops.py b/python/paddle/fluid/dataloader/ops.py index adfb0a8f23e07b..5658852f1836f0 100755 --- a/python/paddle/fluid/dataloader/ops.py +++ b/python/paddle/fluid/dataloader/ops.py @@ -21,7 +21,6 @@ from ...fluid.framework import in_dygraph_mode from ...common_ops_import import * - __all__ = ["map", "data_reader"] @@ -62,9 +61,26 @@ def _generate_stream_id(): def map(map_func, inputs=[]): + def _build_program_inputs(x, map_block): + assert isinstance(x, (list, tuple)) + assert len(x) > 0, "map function must have inputs" + outputs = [] + if isinstance(x[0], (list, tuple)): + for item in x: + outputs.append(_build_program_inputs(item, map_block)) + else: + for item in x: + outputs.append( + map_block.create_var( + name=unique_name.generate("map_sub"), + type=item.desc.type(), + dtype=item.desc.dtype(), + persistable=False)) + return outputs + inputs = _to_list(inputs) if in_dygraph_mode(): - return map_func(*inputs) + return map_func(inputs) helper = LayerHelper("map", **locals()) @@ -74,16 +90,19 @@ def map(map_func, inputs=[]): program_id = _hash_with_id(main_program, map_func) map_block = main_program.current_block() - program_inputs = [ - map_block.create_var( - name=unique_name.generate("map_sub"), - type=inp.desc.type(), - dtype=inp.desc.dtype(), - persistable=False) for inp in inputs] + program_inputs = _build_program_inputs(inputs, map_block) + program_outputs = map_func(*program_inputs) program_outputs = _to_list(program_outputs) - - input_var_names = [v.name for v in 
program_inputs] + input_var_names = [] + for variables in program_inputs: + if isinstance(variables, (list, tuple)): + inputs = inputs[0] + for v in variables: + input_var_names.append(v.name) + else: + input_var_names.append(variables.name) + output_var_names = [v.name for v in program_outputs] outputs = \ @@ -128,22 +147,39 @@ def data_reader(reader_func, reader_block = main_program.current_block() indices_var = reader_block.create_var( - name=unique_name.generate("data_reader_sub"), - type=core.VarDesc.VarType.LOD_TENSOR, - dtype="int64", - persistable=False) + name=unique_name.generate("data_reader_sub"), + type=core.VarDesc.VarType.LOD_TENSOR, + dtype="int64", + persistable=False) program_outputs = reader_func(indices_var) program_outputs = _to_list(program_outputs) - - indices_var_name = indices_var.name - output_var_names = [v.name for v in program_outputs] - outputs = \ - [helper.create_variable( - name=unique_name.generate("data_reader"), - type=outp.desc.type(), - dtype=outp.desc.dtype(), - persistable=True) for outp in program_outputs] + indices_var_name = indices_var.name + output_var_names = [] + for outs in program_outputs: + if isinstance(outs, (list, tuple)): + for out in outs: + output_var_names.append(out.name) + else: + output_var_names.append(outs.name) + + outputs = [] + for outps in program_outputs: + if isinstance(outps, (list, tuple)): + for outp in outps: + outputs.append( + helper.create_variable( + name=unique_name.generate("data_reader"), + type=outp.desc.type(), + dtype=outp.desc.dtype(), + persistable=True)) + else: + outputs.append( + helper.create_variable( + name=unique_name.generate("data_reader"), + type=outps.desc.type(), + dtype=outps.desc.dtype(), + persistable=True)) attrs = { "reader_id": _hash_with_id(main_program), @@ -160,9 +196,6 @@ def data_reader(reader_func, } helper.append_op( - type="data_reader", - inputs={}, - outputs={"Out": outputs}, - attrs=attrs) + type="data_reader", inputs={}, outputs={"Out": outputs}, 
attrs=attrs) return outputs diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index 1fd6b75e712959..c241c41fe6c818 100644 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -868,8 +868,11 @@ def read_file(filename, name=None): return out -def image_decode(x, mode='unchanged', num_threads=2, - host_memory_padding=0, device_memory_padding=0, +def image_decode(x, + mode='unchanged', + num_threads=2, + host_memory_padding=0, + device_memory_padding=0, name=None): """ Decodes a JPEG image into a 3 dimensional RGB Tensor or 1 dimensional Gray Tensor. @@ -912,18 +915,19 @@ def image_decode(x, mode='unchanged', num_threads=2, core.VarDesc.VarType.LOD_TENSOR_ARRAY, False) program_id = utils._hash_with_id(mode, num_threads, name, local_rank) return _C_ops.batch_decode( - x, out, "mode", mode, "num_threads", num_threads, - "local_rank", local_rank, "program_id", program_id, - "host_memory_padding", host_memory_padding, - "device_memory_padding", device_memory_padding) + x, out, "mode", mode, "num_threads", num_threads, "local_rank", + local_rank, "program_id", program_id, "host_memory_padding", + host_memory_padding, "device_memory_padding", device_memory_padding) inputs = {'X': x} - attrs = {"mode": mode, - "num_threads": num_threads, - "local_rank": local_rank, - "program_id": utils._hash_with_id(default_main_program()), - "host_memory_padding": host_memory_padding, - "device_memory_padding": device_memory_padding} + attrs = { + "mode": mode, + "num_threads": num_threads, + "local_rank": local_rank, + "program_id": utils._hash_with_id(default_main_program()), + "host_memory_padding": host_memory_padding, + "device_memory_padding": device_memory_padding + } helper = LayerHelper("batch_decode", **locals()) out = helper.create_variable( @@ -942,8 +946,8 @@ def image_decode_random_crop(x, host_memory_padding=0, device_memory_padding=0, data_layout='NCHW', - aspect_ratio_min=3./4., - aspect_ratio_max=4./3., + aspect_ratio_min=3. 
/ 4., + aspect_ratio_max=4. / 3., area_min=0.08, area_max=1., num_attempts=10, @@ -988,40 +992,49 @@ def image_decode_random_crop(x, local_rank = paddle.distributed.get_rank() if in_dygraph_mode(): out = core.VarBase(core.VarDesc.VarType.UINT8, [], - unique_name.generate("image_decode_random_crop"), - core.VarDesc.VarType.LOD_TENSOR_ARRAY, False) + unique_name.generate("image_decode_random_crop"), + core.VarDesc.VarType.LOD_TENSOR_ARRAY, False) program_id = utils._hash_with_id(mode, num_threads, name, local_rank) return _C_ops.batch_decode_random_crop( - x, out, "mode", mode, "num_threads", num_threads, - "data_layout", data_layout, "aspect_ratio_min", - aspect_ratio_min, "aspect_ratio_max", aspect_ratio_max, - "area_min", area_min, "area_max", area_max, - "num_attempts", num_attempts, "local_rank", local_rank, - "program_id", program_id, - "host_memory_padding", host_memory_padding, - "device_memory_padding", device_memory_padding) + x, out, "mode", mode, "num_threads", num_threads, "data_layout", + data_layout, "aspect_ratio_min", aspect_ratio_min, + "aspect_ratio_max", aspect_ratio_max, "area_min", area_min, + "area_max", area_max, "num_attempts", num_attempts, "local_rank", + local_rank, "program_id", program_id, "host_memory_padding", + host_memory_padding, "device_memory_padding", device_memory_padding) inputs = {'X': x} - attrs = {"mode": mode, - "num_threads": num_threads, - "host_memory_padding": host_memory_padding, - "device_memory_padding": device_memory_padding, - "data_layout": data_layout, - "aspect_ratio_min": aspect_ratio_min, - "aspect_ratio_max": aspect_ratio_max, - "area_min": area_min, - "area_max": area_max, - "num_attempts": num_attempts, - "local_rank": local_rank, - "program_id": utils._hash_with_id(default_main_program())} + attrs = { + "mode": mode, + "num_threads": num_threads, + "host_memory_padding": host_memory_padding, + "device_memory_padding": device_memory_padding, + "data_layout": data_layout, + "aspect_ratio_min": 
aspect_ratio_min, + "aspect_ratio_max": aspect_ratio_max, + "area_min": area_min, + "area_max": area_max, + "num_attempts": num_attempts, + "local_rank": local_rank, + "program_id": utils._hash_with_id(default_main_program()) + } helper = LayerHelper("batch_decode_random_crop", **locals()) - out = helper.create_variable( - name=unique_name.generate("image_decode_random_crop"), - type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, - dtype=x.dtype) + # out = helper.create_variable( + # name=unique_name.generate("image_decode_random_crop"), + # type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, + # dtype=x.dtype) + out = [ + helper.create_variable( + name=unique_name.generate("file_label_loader"), + type=core.VarDesc.VarType.LOD_TENSOR, + dtype='uint8') for i in range(len(x)) + ] helper.append_op( - type="batch_decode_random_crop", inputs=inputs, attrs=attrs, outputs={"Out": out}) + type="batch_decode_random_crop", + inputs=inputs, + attrs=attrs, + outputs={"Out": out}) return out @@ -1030,12 +1043,12 @@ def random_flip(x, prob=0.5, name=None): if prob < 0. or prob > 1.: raise ValueError("prob should in (0, 1) in random_flip") - rand_vec = layers.uniform_random_batch_size_like( - x, [1, 1], min=0., max=1.) + rand_vec = layers.uniform_random_batch_size_like(x, [1, 1], min=0., max=1.) 
return rand_vec < prob -def mirror_normalize(x, mirror, +def mirror_normalize(x, + mirror, mean=[123.675, 116.28, 103.53], std=[58.395, 57.120, 57.375], name=None): @@ -1054,17 +1067,18 @@ def _to_list_3(l): std = _to_list_3(std) if _non_static_mode(): - return _C_ops.mirror_normalize(x, mirror, "mean", mean, - "std", std) + return _C_ops.mirror_normalize(x, mirror, "mean", mean, "std", std) helper = LayerHelper("mirror_normalize", **locals()) dtype = helper.input_dtype() out = helper.create_variable_for_type_inference(dtype) helper.append_op( type="mirror_normalize", - inputs={"X": x, "Mirror": mirror}, + inputs={"X": x, + "Mirror": mirror}, outputs={"Out": out}, - attrs={"mean": mean, "std": std}) + attrs={"mean": mean, + "std": std}) return out @@ -1508,8 +1522,8 @@ def forward(self, x, boxes, boxes_num, aligned=True): def random_crop_and_resize(x, size, - aspect_ratio_min=3./4., - aspect_ratio_max=4./3., + aspect_ratio_min=3. / 4., + aspect_ratio_max=4. / 3., area_min=0.08, area_max=1., num_attempts=10, @@ -1574,10 +1588,9 @@ def random_crop_and_resize(x, out = _C_ops.batch_random_crop_and_resize( x, "size", size, "aspect_ratio_min", aspect_ratio_min, "aspect_ratio_max", aspect_ratio_max, "area_max", area_max, - "area_min", area_min, "num_attempts", num_attempts, - "interp_method", interp_method, "align_corners", - align_corners, "align_mode", align_mode, - "data_format", data_format, "seed", seed) + "area_min", area_min, "num_attempts", num_attempts, "interp_method", + interp_method, "align_corners", align_corners, "align_mode", + align_mode, "data_format", data_format, "seed", seed) return out helper = LayerHelper('batch_random_crop_and_resize', **locals()) @@ -1662,10 +1675,10 @@ def image_resize(x, size = (size, size) if in_dygraph_mode(): - out = _C_ops.batch_resize( - x, "size", size, "interp_method", interp_method, - "align_corners", align_corners, "align_mode", - align_mode, "data_format", data_format, "seed", seed) + out = _C_ops.batch_resize(x, 
"size", size, "interp_method", + interp_method, "align_corners", align_corners, + "align_mode", align_mode, "data_format", + data_format, "seed", seed) return out helper = LayerHelper('batch_resize', **locals()) @@ -1681,10 +1694,7 @@ def image_resize(x, "seed": seed, } helper.append_op( - type="batch_resize", - inputs=inputs, - outputs={"Out": out}, - attrs=attrs) + type="batch_resize", inputs=inputs, outputs={"Out": out}, attrs=attrs) return out diff --git a/python/paddle/vision/reader.py b/python/paddle/vision/reader.py index 061ccb9049a21d..7c8c22e2f676b0 100644 --- a/python/paddle/vision/reader.py +++ b/python/paddle/vision/reader.py @@ -22,15 +22,14 @@ from paddle.common_ops_import import * from paddle import _C_ops -__all__ = [ #noqa +__all__ = [ #noqa 'file_label_loader', 'file_label_reader', ] class _Sampler(object): - def __init__(self, batch_size, num_samples, - shuffle=False, drop_last=False): + def __init__(self, batch_size, num_samples, shuffle=False, drop_last=False): self.batch_size = batch_size self.num_samples = num_samples self.shuffle = shuffle @@ -49,7 +48,7 @@ def __next__(self): batch_len = min(self.batch_size, self.num_samples - self.start_idx) indices = self.sample_ids[self.start_idx:self.start_idx + batch_len] self.start_idx += batch_len - + if self.drop_last and len(indices) < self.batch_size: self.reset() return self.__next__() @@ -66,13 +65,16 @@ class _SamplerManager(object): def __init__(self): self.samplers = {} - def get(self, sample_id, batch_size, num_samples, - shuffle=False, drop_last=False): + def get(self, + sample_id, + batch_size, + num_samples, + shuffle=False, + drop_last=False): if sample_id in self.samplers: return self.samplers[sample_id] - sampler = _Sampler(batch_size, num_samples, - shuffle, drop_last) + sampler = _Sampler(batch_size, num_samples, shuffle, drop_last) self.samplers[sample_id] = sampler return sampler @@ -80,7 +82,7 @@ def get(self, sample_id, batch_size, num_samples, _sampler_manager = 
_SamplerManager() -def file_label_loader(data_root, indices, name=None): +def file_label_loader(data_root, indices, batch_size, name=None): """ Reads a batch of data, outputs the bytes contents of a file as a uint8 Tensor with one dimension. @@ -92,29 +94,27 @@ def file_label_loader(data_root, indices, name=None): need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. """ - # from paddle.vision.datasets import DatasetFolder - # data_folder = DatasetFolder(data_root) - # samples = [s[0] for s in data_folder.samples] - # targets = [s[1] for s in data_folder.samples] if in_dygraph_mode(): - image = core.VarBase(core.VarDesc.VarType.UINT8, [], - unique_name.generate("file_label_loader"), - core.VarDesc.VarType.LOD_TENSOR_ARRAY, False) - return _C_ops.file_label_loader(indices, image, 'data_root', - data_root) + image = [ + core.VarBase(core.VarDesc.VarType.UINT8, [], + unique_name.generate("file_label_loader"), + core.VarDesc.VarType.LOD_TENSOR, False) + for i in range(batch_size) + ] + return _C_ops.file_label_loader(indices, image, 'data_root', data_root) inputs = {"Indices": indices} - attrs = { - 'data_root': data_root, - } + attrs = {'data_root': data_root, } helper = LayerHelper("file_label_loader", **locals()) - image = helper.create_variable( - name=unique_name.generate("file_label_loader"), - type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, - dtype='uint8') - + image = [ + helper.create_variable( + name=unique_name.generate("file_label_loader"), + type=core.VarDesc.VarType.LOD_TENSOR, + dtype='uint8') for i in range(batch_size) + ] + label = helper.create_variable( name=unique_name.generate("file_label_loader"), type=core.VarDesc.VarType.LOD_TENSOR, @@ -163,23 +163,25 @@ def file_label_reader(file_root, targets = [s[1] for s in data_folder.samples] if in_dygraph_mode(): - sample_id = utils._hash_with_id(file_root, batch_size, - shuffle, drop_last) + sample_id = utils._hash_with_id(file_root, batch_size, shuffle, + 
drop_last) sampler = _sampler_manager.get(sample_id, batch_size=batch_size, num_samples=len(samples), shuffle=shuffle, drop_last=drop_last) indices = paddle.to_tensor(next(sampler), dtype='int64') - return file_label_loader(file_root, indices) + outs = file_label_loader(file_root, indices, batch_size) + return outs[:-1], outs[-1] def _reader(indices): - return file_label_loader(file_root, indices) - - return paddle.io.data_reader(_reader, - batch_size=batch_size, - num_samples=len(samples), - shuffle=shuffle, - drop_last=drop_last, - seed=seed) - + return file_label_loader(file_root, indices, batch_size) + + outs = paddle.io.data_reader( + _reader, + batch_size=batch_size, + num_samples=len(samples), + shuffle=shuffle, + drop_last=drop_last, + seed=seed) + return outs[:-1], outs[-1] From 59d024227532442c2bd99d78258d389f42f61dde Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Fri, 1 Apr 2022 16:16:19 +0000 Subject: [PATCH 83/95] refine map API. test=develop --- .../fluid/operators/data/batch_decode_op.cc | 2 +- paddle/fluid/operators/data/map_runner.cc | 4 +- python/paddle/fluid/dataloader/ops.py | 104 +++++++++++------- 3 files changed, 68 insertions(+), 42 deletions(-) diff --git a/paddle/fluid/operators/data/batch_decode_op.cc b/paddle/fluid/operators/data/batch_decode_op.cc index d0f8dae02e5d17..bb636ed1d267d4 100644 --- a/paddle/fluid/operators/data/batch_decode_op.cc +++ b/paddle/fluid/operators/data/batch_decode_op.cc @@ -60,7 +60,7 @@ class BatchDecodeOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "A one dimensional uint8 tensor containing the raw bytes " "of the JPEG image. It is a tensor with rank 1."); - AddOutput("Out", "The output tensor of DecodeJpeg op"); + AddOutput("Out", "The output tensor of DecodeJpeg op").AsDuplicable(); AddComment(R"DOC( This operator decodes a JPEG image into a 3 dimensional RGB Tensor or 1 dimensional Gray Tensor. 
Optionally converts the image to the diff --git a/paddle/fluid/operators/data/map_runner.cc b/paddle/fluid/operators/data/map_runner.cc index e4e211b5f45506..433964e50de03e 100644 --- a/paddle/fluid/operators/data/map_runner.cc +++ b/paddle/fluid/operators/data/map_runner.cc @@ -44,13 +44,13 @@ MapRunner::MapRunner( "input_var_names length should be equal to input_queues length, " "but recieve %d != %d.", input_var_names_.size(), - input_var_names_.size())); + input_queues_.size())); PADDLE_ENFORCE_EQ(output_var_names_.size(), output_queues_.size(), platform::errors::InvalidArgument( "output_var_names length should be equal to output_queues length, " "but recieve %d != %d.", output_var_names_.size(), - output_var_names_.size())); + output_queues_.size())); StartMapThread(scope); } diff --git a/python/paddle/fluid/dataloader/ops.py b/python/paddle/fluid/dataloader/ops.py index 5658852f1836f0..67ac5d4be148b5 100755 --- a/python/paddle/fluid/dataloader/ops.py +++ b/python/paddle/fluid/dataloader/ops.py @@ -21,6 +21,8 @@ from ...fluid.framework import in_dygraph_mode from ...common_ops_import import * +from collections.abc import Sequence, Mapping + __all__ = ["map", "data_reader"] @@ -60,57 +62,81 @@ def _generate_stream_id(): return _stream_id_generator.get_stream_id() -def map(map_func, inputs=[]): - def _build_program_inputs(x, map_block): - assert isinstance(x, (list, tuple)) - assert len(x) > 0, "map function must have inputs" - outputs = [] - if isinstance(x[0], (list, tuple)): - for item in x: - outputs.append(_build_program_inputs(item, map_block)) - else: - for item in x: - outputs.append( - map_block.create_var( - name=unique_name.generate("map_sub"), - type=item.desc.type(), - dtype=item.desc.dtype(), - persistable=False)) - return outputs - - inputs = _to_list(inputs) +def map(map_func, *args, **kwargs): if in_dygraph_mode(): return map_func(inputs) helper = LayerHelper("map", **locals()) + # NOTE: map_func can take List(Tensor) (while batch_size > 1) as + 
# inputs or outputs, which means we need to keep the structure + # info when calling map_func, _build_program_inputs used to + # generate 3 kinds of infos: + # 1. return value: holds variables in map_block, and keeps the + # structure info of map inputs, will be used to call map_func + # 2. input_vars: holds variables in map_block in flatten format, + # will be used to generate input_var_names + # 3. flat_inputs: holds variables in main_program/global_block in + # flatten format, will be used as inputs for appendding map OP + # and _parse_program_outputs follows similar logic + def _build_program_inputs(inputs, map_block, + input_vars=[], flat_inputs=[]): + if isinstance(inputs, Sequence): + return [_build_program_inputs(inp, map_block, input_vars, + flat_inputs) for inp in inputs] + elif isinstance(inputs, Mapping): + return {k: _build_program_inputs(v, map_block, input_vars, + flat_inputs) for k,v in inputs.items()} + else: + var = map_block.create_var( + name=unique_name.generate("map_sub"), + type=inputs.desc.type(), + dtype=inputs.desc.dtype(), + persistable=False) + input_vars.append(var) + flat_inputs.append(inputs) + return var + + def _parse_program_outputs(outputs, output_vars=[], flat_outputs=[]): + if isinstance(outputs, Sequence): + return [_parse_program_outputs(outp, output_vars, + flat_outputs) for outp in outputs] + elif isinstance(outputs, Mapping): + return {k: _parse_program_outputs(v, output_vars, + flat_outputs) for outp in outputs} + else: + var = helper.create_variable( + name=unique_name.generate("map"), + type=outputs.desc.type(), + dtype=outputs.desc.dtype(), + persistable=True) + flat_outputs.append(var) + output_vars.append(outputs) + return var + # build map block main_program = helper.main_program with _ProgramGuard(main_program): program_id = _hash_with_id(main_program, map_func) map_block = main_program.current_block() - program_inputs = _build_program_inputs(inputs, map_block) + input_vars, flat_inputs = [], [] + 
program_inputs_args = _build_program_inputs( + args, map_block, input_vars, flat_inputs) + program_inputs_kwargs = _build_program_inputs( + kwargs, map_block, input_vars, flat_inputs) - program_outputs = map_func(*program_inputs) - program_outputs = _to_list(program_outputs) - input_var_names = [] - for variables in program_inputs: - if isinstance(variables, (list, tuple)): - inputs = inputs[0] - for v in variables: - input_var_names.append(v.name) - else: - input_var_names.append(variables.name) + program_outputs = map_func(*program_inputs_args, + **program_inputs_kwargs) - output_var_names = [v.name for v in program_outputs] + # NOTE: _parse_program_outputs create main_program variables, so + # we need to call it outside of _ProgramGuard + output_vars, flat_outputs = [], [] + outputs = _parse_program_outputs(program_outputs, output_vars, + flat_outputs) + input_var_names = [v.name for v in input_vars] + output_var_names = [v.name for v in output_vars] - outputs = \ - [helper.create_variable( - name=unique_name.generate("map"), - type=outp.desc.type(), - dtype=outp.desc.dtype(), - persistable=True) for outp in program_outputs] attrs = { "map_block": map_block, "program_id": program_id, @@ -124,8 +150,8 @@ def _build_program_inputs(x, map_block): helper.append_op( type="map", - inputs={"In": inputs}, - outputs={"Out": outputs}, + inputs={"In": flat_inputs}, + outputs={"Out": flat_outputs}, attrs=attrs) return outputs From 52771e6c7661ebbaf83fb10637544b930e938c8d Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Sat, 2 Apr 2022 02:51:14 +0000 Subject: [PATCH 84/95] fix ci compile. 
test=develop --- paddle/fluid/operators/data/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/data/CMakeLists.txt b/paddle/fluid/operators/data/CMakeLists.txt index d2b28e8e5710e8..5c299caaa14322 100644 --- a/paddle/fluid/operators/data/CMakeLists.txt +++ b/paddle/fluid/operators/data/CMakeLists.txt @@ -13,7 +13,7 @@ cc_library(map_runner SRCS map_runner.cc DEPS parallel_executor simple_threadpoo op_library(map_op SRCS map_op.cc map_op.cu.cc DEPS map_runner ${OP_HEADER_DEPS}) cc_library(random_roi_generator SRCS random_roi_generator.cc DEPS ${OP_HEADER_DEPS}) -cc_library(image_decoder SRCS image_decoder.cc DEPS random_roi_generator ${OP_HEADER_DEPS} ${OpenCV_LIBS}) +cc_library(image_decoder SRCS image_decoder.cc DEPS random_roi_generator ${OP_HEADER_DEPS}) op_library(batch_decode_random_crop_op SRCS batch_decode_random_crop_op.cc batch_decode_random_crop_op.cu DEPS image_decoder ${OP_HEADER_DEPS}) op_library(batch_decode_op SRCS batch_decode_op.cc batch_decode_op.cu DEPS image_decoder ${OP_HEADER_DEPS}) From 353f759461d39b0dd7743f4ebe96cda9cef0f39f Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Sat, 2 Apr 2022 06:45:43 +0000 Subject: [PATCH 85/95] fix ci compile. 
test=develop --- paddle/fluid/operators/data/CMakeLists.txt | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/paddle/fluid/operators/data/CMakeLists.txt b/paddle/fluid/operators/data/CMakeLists.txt index 5c299caaa14322..5e242b343bfcdc 100644 --- a/paddle/fluid/operators/data/CMakeLists.txt +++ b/paddle/fluid/operators/data/CMakeLists.txt @@ -1,4 +1,5 @@ include(operators) + if(WITH_UNITY_BUILD) # Load Unity Build rules for operators in paddle/fluid/operators/data/ include(unity_build_rule.cmake) @@ -12,19 +13,16 @@ op_library(data_reader_op SRCS data_reader_op.cc DEPS ${OP_HEADER_DEPS}) cc_library(map_runner SRCS map_runner.cc DEPS parallel_executor simple_threadpool scope) op_library(map_op SRCS map_op.cc map_op.cu.cc DEPS map_runner ${OP_HEADER_DEPS}) -cc_library(random_roi_generator SRCS random_roi_generator.cc DEPS ${OP_HEADER_DEPS}) -cc_library(image_decoder SRCS image_decoder.cc DEPS random_roi_generator ${OP_HEADER_DEPS}) -op_library(batch_decode_random_crop_op SRCS batch_decode_random_crop_op.cc batch_decode_random_crop_op.cu DEPS image_decoder ${OP_HEADER_DEPS}) -op_library(batch_decode_op SRCS batch_decode_op.cc batch_decode_op.cu DEPS image_decoder ${OP_HEADER_DEPS}) - -op_library(batch_random_crop_and_resize_op SRCS batch_random_crop_and_resize_op.cc batch_random_crop_and_resize_op.cu DEPS ${OP_HEADER_DEPS}) -op_library(batch_resize_op SRCS batch_resize_op.cc batch_resize_op.cu DEPS ${OP_HEADER_DEPS}) - op_library(file_label_loader_op SRCS file_label_loader_op.cc DEPS ${OP_HEADER_DEPS}) -op_library(mirror_normalize_op SRCS mirror_normalize_op.cc mirror_normalize_op.cu DEPS ${OP_HEADER_DEPS}) +cc_library(random_roi_generator SRCS random_roi_generator.cc DEPS ${OP_HEADER_DEPS}) +cc_library(image_decoder SRCS image_decoder.cc DEPS random_roi_generator ${OP_HEADER_DEPS}) -# register_operators() +if (WITH_GPU) + op_library(batch_decode_random_crop_op SRCS batch_decode_random_crop_op.cc batch_decode_random_crop_op.cu 
DEPS image_decoder ${OP_HEADER_DEPS}) + op_library(batch_decode_op SRCS batch_decode_op.cc batch_decode_op.cu DEPS image_decoder ${OP_HEADER_DEPS}) -# TODO: add test here -# cc_test(xxx SRCS xxx DEPS xxx + op_library(batch_random_crop_and_resize_op SRCS batch_random_crop_and_resize_op.cc batch_random_crop_and_resize_op.cu DEPS ${OP_HEADER_DEPS}) + op_library(batch_resize_op SRCS batch_resize_op.cc batch_resize_op.cu DEPS ${OP_HEADER_DEPS}) + op_library(mirror_normalize_op SRCS mirror_normalize_op.cc mirror_normalize_op.cu DEPS ${OP_HEADER_DEPS}) +endif() From ce4610bdae1fb471c3499649fe444c1c41d7d10f Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Sat, 2 Apr 2022 07:16:41 +0000 Subject: [PATCH 86/95] fix ci compile. test=develop --- paddle/fluid/operators/data/image_decoder.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/data/image_decoder.cc b/paddle/fluid/operators/data/image_decoder.cc index 37fec0433f9ca4..050016c11d7ce5 100644 --- a/paddle/fluid/operators/data/image_decoder.cc +++ b/paddle/fluid/operators/data/image_decoder.cc @@ -210,7 +210,7 @@ nvjpegStatus_t ImageDecoder::GPUDecodeRandomCrop(const uint8_t* bit_stream, void ImageDecoder::Run(const uint8_t* bit_stream, size_t bit_len, framework::LoDTensor* out, RandomROIGenerator* roi_generator, - platform::Place& place) { + const platform::Place& place) { nvjpegImage_t image; nvjpegStatus_t status = From a08e487ffeaa319805b01d6db792aa9eb45f7433 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Sat, 2 Apr 2022 08:00:46 +0000 Subject: [PATCH 87/95] add test_data_pipeline. 
test=develop --- paddle/fluid/operators/data/data_reader_op.cc | 7 - paddle/fluid/operators/data/data_reader_op.h | 4 +- paddle/fluid/operators/data/dataloader_op.h | 4 +- paddle/fluid/operators/data/map_runner.cc | 13 +- paddle/fluid/operators/data/map_runner.h | 4 +- paddle/fluid/operators/data/utils.h | 12 +- paddle/fluid/pybind/pybind.cc | 3 - python/paddle/fluid/core.py | 2 - python/paddle/fluid/dataloader/pipeline.py | 15 +- python/paddle/tests/test_data_pipeline.py | 168 ++++++++++++++++++ 10 files changed, 187 insertions(+), 45 deletions(-) create mode 100644 python/paddle/tests/test_data_pipeline.py diff --git a/paddle/fluid/operators/data/data_reader_op.cc b/paddle/fluid/operators/data/data_reader_op.cc index 7a287a137a9ae5..45908a95b475a1 100644 --- a/paddle/fluid/operators/data/data_reader_op.cc +++ b/paddle/fluid/operators/data/data_reader_op.cc @@ -34,13 +34,6 @@ class DataReaderOp : public framework::OperatorBase { OP_INOUT_CHECK(ctx->HasOutputs("Out"), "Output", "Out", "DataReaderOp"); } -// protected: -// framework::OpKernelType GetExpectedKernelType( -// const framework::ExecutionContext& ctx) const { -// return framework::OpKernelType(framework::proto::VarType::FP32, -// ctx.GetPlace()); -// } -// private: void RunImpl(const framework::Scope& scope, const platform::Place& dev_place) const override { diff --git a/paddle/fluid/operators/data/data_reader_op.h b/paddle/fluid/operators/data/data_reader_op.h index 61831cd8b2d61c..7691ee2376fae6 100644 --- a/paddle/fluid/operators/data/data_reader_op.h +++ b/paddle/fluid/operators/data/data_reader_op.h @@ -311,7 +311,7 @@ class ReaderManager { void ShutDownReader(const int64_t reader_id) { auto iter = id_to_reader_.find(reader_id); if (iter != id_to_reader_.end()) { - iter->second->ShutDown(); + if (iter->second.get()) iter->second.get()->ShutDown(); id_to_reader_.erase(reader_id); } } @@ -327,7 +327,7 @@ class ReaderManager { auto iter = id_to_reader_.begin(); while (iter != id_to_reader_.end()){ 
if(iter->second.get()){ - iter->second->ShutDown(); + iter->second.get()->ShutDown(); } iter++; } diff --git a/paddle/fluid/operators/data/dataloader_op.h b/paddle/fluid/operators/data/dataloader_op.h index dc227f43cfcaaa..ff64c38a55b134 100644 --- a/paddle/fluid/operators/data/dataloader_op.h +++ b/paddle/fluid/operators/data/dataloader_op.h @@ -11,6 +11,7 @@ #pragma once #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/operators/data/pipeline.h" namespace paddle { @@ -39,8 +40,7 @@ class DataLoaderOpKernel : public framework::OpKernel { if (!pipeline->IsRunning()) { VLOG(4) << "DataLoaderOpKernel Pipeline not running, throw EOF"; - throw platform::EOFException("DataLoaderOpKernel epoch end", - __FILE__, __LINE__); + PADDLE_THROW_EOF(); } } }; diff --git a/paddle/fluid/operators/data/map_runner.cc b/paddle/fluid/operators/data/map_runner.cc index 433964e50de03e..4e043b60e26f96 100644 --- a/paddle/fluid/operators/data/map_runner.cc +++ b/paddle/fluid/operators/data/map_runner.cc @@ -27,8 +27,7 @@ MapRunner::MapRunner( const std::vector &output_var_names, const std::vector> input_queues, const std::vector> output_queues) - : thread_pool_(1), - running_(true), + : running_(true), shutdown_(false), map_block_(map_block), program_id_(program_id), @@ -109,7 +108,7 @@ void signal_handler(int sig_num) { } void MapRunner::StartMapThread(const Scope* scope) { - thread_pool_.enqueue([this, scope]() -> void { + map_thread_ = std::thread([this, scope]() -> void { // MapThread may crash with SIGSEGV singal in Executor::Prepare // when Python program break and exit, catch SIGSEGV singal and // exit thread silently @@ -197,13 +196,15 @@ void MapRunner::CheckOutputVarStatus(const Variable &var, void MapRunner::ShutDown() { VLOG(1) << "MapRunner shutdown " << program_id_; // close all output queue, op after this op can shutdown itself + for (auto queue : output_queues_) { + if(queue && !queue->IsClosed()) 
queue->Close(); + } + shutdown_ = true; running_ = false; running_cond_.notify_all(); - for (auto queue : output_queues_) { - if(queue && !queue->IsClosed()) queue->Close(); - } + if (map_thread_.joinable()) map_thread_.join(); } void MapRunner::Reset() { diff --git a/paddle/fluid/operators/data/map_runner.h b/paddle/fluid/operators/data/map_runner.h index fe951ae84256e3..b110e67d769b0d 100644 --- a/paddle/fluid/operators/data/map_runner.h +++ b/paddle/fluid/operators/data/map_runner.h @@ -72,7 +72,7 @@ class MapRunner { void CheckInputVarStatus(const Variable &var, const std::string &var_name); void CheckOutputVarStatus(const Variable &var, const std::string &var_name); - ThreadPool thread_pool_; + std::thread map_thread_; bool running_; std::condition_variable running_cond_; bool shutdown_; @@ -129,7 +129,7 @@ class MapRunnerManager { std::lock_guard lk(m_); auto iter = prog_id_to_runner_.find(program_id); if (iter != prog_id_to_runner_.end()) { - iter->second.get()->ShutDown(); + if(iter->second.get()) iter->second.get()->ShutDown(); prog_id_to_runner_.erase(iter); } } diff --git a/paddle/fluid/operators/data/utils.h b/paddle/fluid/operators/data/utils.h index b0ff265820c262..ed7db01d68e265 100644 --- a/paddle/fluid/operators/data/utils.h +++ b/paddle/fluid/operators/data/utils.h @@ -35,9 +35,9 @@ void ShutDownAllDataLoaders() { // step 3: shutdown MapRunner MapRunnerManager::Instance()->ShutDown(); - - // step 3: shutdown Pipeline - PipelineManager::Instance()->ShutDown(); + + // // step 3: shutdown Pipeline + // PipelineManager::Instance()->ShutDown(); VLOG(4) << "ShutDownAllDataLoaders Pipeline shutdown finish"; } @@ -49,12 +49,6 @@ void ShutDownReadersAndDecoders(const int64_t program_id) { ImageDecoderThreadPoolManager::Instance()->ShutDownDecoder(program_id); } -void ShutDownMaps(const std::vector program_ids) { - for (auto& program_id : program_ids) { - MapRunnerManager::Instance()->ShutDownMapRunner(program_id); - } -} - void ShutDownPipeline(const 
int64_t program_id) { PipelineManager::Instance()->ShutDownPipeline(program_id); } diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 4104330400c378..f13d28b47c5e57 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -772,9 +772,6 @@ PYBIND11_MODULE(core_noavx, m) { &paddle::operators::data::ShutDownAllDataLoaders); m.def("_shutdown_readers_and_decoders", &paddle::operators::data::ShutDownReadersAndDecoders); - m.def("_shutdown_maps", [](const std::vector program_ids) { - paddle::operators::data::ShutDownMaps(program_ids); - }); m.def("_shutdown_pipeline", &paddle::operators::data::ShutDownPipeline); m.def("_reset_dataloader", [](const int64_t reader_id, diff --git a/python/paddle/fluid/core.py b/python/paddle/fluid/core.py index e5f27b094fb8f2..064aa7d175af23 100644 --- a/python/paddle/fluid/core.py +++ b/python/paddle/fluid/core.py @@ -283,7 +283,6 @@ def to_list(s): from .core_avx import _set_current_stream from .core_avx import _shutdown_all_dataloaders from .core_avx import _shutdown_readers_and_decoders - from .core_avx import _shutdown_maps from .core_avx import _shutdown_pipeline from .core_avx import _reset_dataloader if sys.platform != 'win32': @@ -344,7 +343,6 @@ def to_list(s): from .core_noavx import _set_current_stream from .core_noavx import _shutdown_all_dataloaders from .core_noavx import _shutdown_readers_and_decoders - from .core_noavx import _shutdown_maps from .core_noavx import _shutdown_pipeline from .core_noavx import _reset_dataloder from .core_noavx import _Profiler, _ProfilerResult, _RecordEvent diff --git a/python/paddle/fluid/dataloader/pipeline.py b/python/paddle/fluid/dataloader/pipeline.py index 75ac0faa6dc882..0f38163ea558d4 100755 --- a/python/paddle/fluid/dataloader/pipeline.py +++ b/python/paddle/fluid/dataloader/pipeline.py @@ -55,7 +55,6 @@ def __init__(self, queue_depth=2): def _init_programs(self): self._main_program = fluid.Program() - self._startup_program = 
fluid.Program() self._out_vars = [] self._out_names = [] self._is_built = False @@ -64,14 +63,10 @@ def __enter__(self): # switch main and startup program paddle.enable_static() self._main_program = framework.switch_main_program(self._main_program) - self._startup_program = framework.switch_startup_program( - self._startup_program) return self def __exit__(self, exception_type, exception_value, traceback): self._main_program = framework.switch_main_program(self._main_program) - self._startup_program = framework.switch_startup_program( - self._startup_program) local_rank = paddle.distributed.get_rank() paddle.disable_static("gpu:" + str(local_rank)) @@ -133,6 +128,8 @@ def __next__(self): try: _C_ops.dataloader(self._output_vars, *self._attrs) + except KeyboardInterrupt: + pass except: raise StopIteration @@ -159,14 +156,8 @@ def shutdown(self): try: program_id = _hash_with_id(self._main_program) core._shutdown_readers_and_decoders(program_id) - - map_program_ids = [] - for op in self._main_program.block(0).ops: - if op.type == "map" and op.has_attr('program_id'): - map_program_ids.append(op.attr('program_id')) - core._shutdown_maps(map_program_ids) - core._shutdown_pipeline(program_id) + del self._main_program finally: self.is_shutdown = True diff --git a/python/paddle/tests/test_data_pipeline.py b/python/paddle/tests/test_data_pipeline.py new file mode 100644 index 00000000000000..2c8e3124503ef2 --- /dev/null +++ b/python/paddle/tests/test_data_pipeline.py @@ -0,0 +1,168 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import cv2 +import unittest +import numpy as np + +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.utils.download import get_path_from_url +from paddle.vision.datasets import DatasetFolder +from paddle.vision.ops import image_decode_random_crop, image_resize, \ + random_flip, mirror_normalize +from paddle.vision.reader import file_label_reader + + +DATASET_HOME = os.path.expanduser("~/.cache/paddle/datasets") +DATASET_URL = "https://paddlemodels.cdn.bcebos.com/ImageNet_stub.tar" +DATASET_MD5 = "c7110519124a433901cf005a4a91b607" +IMAGE_NUM = 100 + + +class TestDataPipelineCase1(unittest.TestCase): + def setUp(self): + self.data_root = get_path_from_url(DATASET_URL, DATASET_HOME, + DATASET_MD5) + + self.num_epoches= 2 + self.batch_size = 16 + self.num_threads = 2 + self.host_memory_padding = 1000000 + self.device_memory_padding = 1000000 + + self.shuffle = False + self.drop_last = True + self.calc_iter_info() + + self.target_size = 224 + self.flip_prob = 0.5 + self.mean = [123.675, 116.28, 103.53] + self.std = [58.395, 57.120, 57.375] + + self.mean_np = np.array(self.mean).reshape([1, 3, 1, 1]) + self.std_np = np.array(self.std).reshape([1, 3, 1, 1]) + + self.build_reader() + + def calc_iter_info(self): + if self.drop_last: + self.num_iters = IMAGE_NUM // self.batch_size + else: + self.num_iters = (IMAGE_NUM + self.batch_size - 1) \ + // self.batch_size + + if self.drop_last: + self.last_batch_size = self.batch_size + else: + self.last_batch_size = IMAGE_NUM % self.batch_size + if self.last_batch_size 
== 0: + self.last_batch_size = self.batch_size + + def build_reader(self): + def imagenet_reader(): + image, label = file_label_reader(self.data_root, + batch_size=self.batch_size, + shuffle=self.shuffle, + drop_last=self.drop_last) + def decode(image): + image = image_decode_random_crop( + image, num_threads=self.num_threads) + return image + + def resize(image): + image = image_resize(image, size=self.target_size) + return image + + def flip_normalize(image): + mirror = random_flip(image, prob=self.flip_prob) + image = mirror_normalize(image, mirror, + mean=self.mean, + std=self.std) + return image + + image = paddle.io.map(decode, image) + image = paddle.io.map(resize, image) + image = paddle.io.map(flip_normalize, image) + + return {'image': image, 'label': label} + + self.reader = imagenet_reader + + def test_static_output(self): + loader = paddle.io.DataLoader(self.reader) + + for eid in range(self.num_epoches): + num_iters = 0 + for data in loader: + image = data['image'].numpy() + assert image.shape[0] == self.batch_size + assert image.shape[1] == 3 + assert image.shape[2] == self.target_size + assert image.shape[3] == self.target_size + assert image.dtype == np.float32 + + restore_image = image * self.std_np + self.mean_np + assert np.all(restore_image > -1.) + assert np.all(restore_image < 256.) 
+ + label = data['label'].numpy() + assert label.shape[0] == self.batch_size + assert label.dtype == np.int64 + assert np.all(label >= 0) + assert np.all(label <= 1) + + num_iters += 1 + + assert num_iters == self.num_iters + if eid < self.num_epoches - 1: + loader.reset() + + del loader + + def test_shutdown(self): + loader = paddle.io.DataLoader(self.reader) + core._shutdown_all_dataloaders() + + +class TestDataPipelineCase2(TestDataPipelineCase1): + def setUp(self): + self.data_root = get_path_from_url(DATASET_URL, DATASET_HOME, + DATASET_MD5) + + self.num_epoches= 1 + self.batch_size = 32 + self.num_threads = 4 + self.host_memory_padding = 0 + self.device_memory_padding = 0 + + self.shuffle = True + self.drop_last = True + self.calc_iter_info() + + self.target_size = 128 + self.flip_prob = 0.5 + self.mean = [123.675, 116.28, 103.53] + self.std = [58.395, 57.120, 57.375] + + self.mean_np = np.array(self.mean).reshape([1, 3, 1, 1]) + self.std_np = np.array(self.std).reshape([1, 3, 1, 1]) + + self.build_reader() + + +if __name__ == '__main__': + unittest.main() From 0206f3f983a6373f9465fa9c5e3ba06b70104e67 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Sat, 2 Apr 2022 16:04:55 +0000 Subject: [PATCH 88/95] add dynamic unittest for all data pipeline ops. 
test=develop --- paddle/fluid/operators/data/CMakeLists.txt | 6 +- .../fluid/operators/data/batch_decode_op.cc | 26 ++-- .../fluid/operators/data/batch_decode_op.cu | 21 ++- .../data/batch_decode_random_crop_op.cc | 15 +- .../data/batch_random_crop_and_resize_op.cc | 8 +- .../data/batch_random_crop_and_resize_op.cu | 16 +- .../data/random_crop_and_resize_op.cc | 124 --------------- ...a_api_random_flip.py => test_data_apis.py} | 9 ++ python/paddle/tests/test_ops_crop_resize.py | 147 +++++++----------- python/paddle/tests/test_ops_decode.py | 122 +++++++++++++-- .../tests/test_ops_file_label_loader.py | 4 +- .../paddle/tests/test_ops_mirror_normalize.py | 12 +- python/paddle/vision/ops.py | 18 +-- 13 files changed, 229 insertions(+), 299 deletions(-) delete mode 100644 paddle/fluid/operators/data/random_crop_and_resize_op.cc rename python/paddle/tests/{test_data_api_random_flip.py => test_data_apis.py} (89%) diff --git a/paddle/fluid/operators/data/CMakeLists.txt b/paddle/fluid/operators/data/CMakeLists.txt index 5e242b343bfcdc..925a1b9a8cdd81 100644 --- a/paddle/fluid/operators/data/CMakeLists.txt +++ b/paddle/fluid/operators/data/CMakeLists.txt @@ -15,14 +15,16 @@ op_library(map_op SRCS map_op.cc map_op.cu.cc DEPS map_runner ${OP_HEADER_DEPS}) op_library(file_label_loader_op SRCS file_label_loader_op.cc DEPS ${OP_HEADER_DEPS}) -cc_library(random_roi_generator SRCS random_roi_generator.cc DEPS ${OP_HEADER_DEPS}) -cc_library(image_decoder SRCS image_decoder.cc DEPS random_roi_generator ${OP_HEADER_DEPS}) if (WITH_GPU) + cc_library(random_roi_generator SRCS random_roi_generator.cc DEPS ${OP_HEADER_DEPS}) + cc_library(image_decoder SRCS image_decoder.cc DEPS random_roi_generator ${OP_HEADER_DEPS}) + op_library(batch_decode_random_crop_op SRCS batch_decode_random_crop_op.cc batch_decode_random_crop_op.cu DEPS image_decoder ${OP_HEADER_DEPS}) op_library(batch_decode_op SRCS batch_decode_op.cc batch_decode_op.cu DEPS image_decoder ${OP_HEADER_DEPS}) 
op_library(batch_random_crop_and_resize_op SRCS batch_random_crop_and_resize_op.cc batch_random_crop_and_resize_op.cu DEPS ${OP_HEADER_DEPS}) op_library(batch_resize_op SRCS batch_resize_op.cc batch_resize_op.cu DEPS ${OP_HEADER_DEPS}) + op_library(mirror_normalize_op SRCS mirror_normalize_op.cc mirror_normalize_op.cu DEPS ${OP_HEADER_DEPS}) endif() diff --git a/paddle/fluid/operators/data/batch_decode_op.cc b/paddle/fluid/operators/data/batch_decode_op.cc index bb636ed1d267d4..e2f3675bc6b34c 100644 --- a/paddle/fluid/operators/data/batch_decode_op.cc +++ b/paddle/fluid/operators/data/batch_decode_op.cc @@ -23,8 +23,12 @@ class BatchDecodeOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "DecodeJpeg"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "DecodeJpeg"); + PADDLE_ENFORCE_GE(ctx->Inputs("X").size(), 1UL, + platform::errors::InvalidArgument( + "Inputs(X) of DecodeJpeg should not be empty.")); + PADDLE_ENFORCE_GE(ctx->Outputs("Out").size(), 1UL, + platform::errors::InvalidArgument( + "Outputs(Out) of DecodeJpeg should not be empty.")); } protected: @@ -46,28 +50,22 @@ class BatchDecodeOp : public framework::OperatorWithKernel { } }; -class BatchDecodeInferVarType : public framework::VarTypeInference { - public: - void operator()(framework::InferVarTypeContext* ctx) const override { - ctx->SetOutputType("Out", framework::proto::VarType::LOD_TENSOR_ARRAY, - framework::ALL_ELEMENTS); - } -}; - class BatchDecodeOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", - "A one dimensional uint8 tensor containing the raw bytes " - "of the JPEG image. It is a tensor with rank 1."); - AddOutput("Out", "The output tensor of DecodeJpeg op").AsDuplicable(); + "(List[Tensor]) A one dimensional uint8 tensor containing " + "the raw bytes of the JPEG image. 
It is a tensor with rank " + "1.").AsDuplicable(); + AddOutput("Out", "The output tensor of BatchDecodeOp").AsDuplicable(); AddComment(R"DOC( This operator decodes a JPEG image into a 3 dimensional RGB Tensor or 1 dimensional Gray Tensor. Optionally converts the image to the desired format. The values of the output tensor are uint8 between 0 and 255. )DOC"); - AddAttr("num_threads", "Path of the file to be readed.").SetDefault(2); + AddAttr("num_threads", "Path of the file to be readed.") + .SetDefault(2); AddAttr("local_rank", "(int)" "The index of the op to start execution"); diff --git a/paddle/fluid/operators/data/batch_decode_op.cu b/paddle/fluid/operators/data/batch_decode_op.cu index 0b640bbb3b986f..b587fc300277b6 100644 --- a/paddle/fluid/operators/data/batch_decode_op.cu +++ b/paddle/fluid/operators/data/batch_decode_op.cu @@ -42,23 +42,22 @@ class GPUBatchDecodeKernel : public framework::OpKernel { static_cast(host_memory_padding), static_cast(device_memory_padding)); - const framework::LoDTensorArray* inputs = - ctx.Input("X"); + auto inputs = ctx.MultiInput("X"); + int batch_size = inputs.size(); - auto* out = ctx.OutputVar("Out"); - auto& out_array = *out->GetMutable(); - out_array.resize(inputs->size()); + auto out_array = ctx.MultiOutput("Out"); + auto dev = platform::CUDAPlace(local_rank); - for (size_t i = 0; i < inputs->size(); i++) { - const framework::LoDTensor x = inputs->at(i); - auto* x_data = x.data(); - size_t x_numel = static_cast(x.numel()); + for (size_t i = 0; i < batch_size; i++) { + const framework::LoDTensor* x = inputs.at(i); + auto* x_data = x->data(); + size_t x_numel = static_cast(x->numel()); ImageDecodeTask task = {.bit_stream = x_data, .bit_len = x_numel, - .tensor = &out_array[i], + .tensor = out_array[i], .roi_generator = nullptr, - .place = ctx.GetPlace()}; + .place = dev}; decode_pool->AddTask(std::make_shared(task)); } diff --git a/paddle/fluid/operators/data/batch_decode_random_crop_op.cc 
b/paddle/fluid/operators/data/batch_decode_random_crop_op.cc index 508802154d25f7..020c400eb5992a 100644 --- a/paddle/fluid/operators/data/batch_decode_random_crop_op.cc +++ b/paddle/fluid/operators/data/batch_decode_random_crop_op.cc @@ -95,22 +95,15 @@ class BatchDecodeRandomCropOp : public framework::OperatorWithKernel { } }; -class BatchDecodeRandomCropInferVarType : public framework::VarTypeInference { - public: - void operator()(framework::InferVarTypeContext* ctx) const override { - ctx->SetOutputType("Out", framework::proto::VarType::LOD_TENSOR_ARRAY, - framework::ALL_ELEMENTS); - } -}; - class BatchDecodeRandomCropOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", - "A one dimensional uint8 tensor containing the raw bytes " - "of the JPEG image. It is a tensor with rank 1.") + "(List[Tensor]) A one dimensional uint8 tensor containing the " + "raw bytes of the JPEG image. It is a tensor with rank 1.") + .AsDuplicable(); + AddOutput("Out", "The output tensor of BatchDecodeRandomCropOp") .AsDuplicable(); - AddOutput("Out", "The output tensor of DecodeJpeg op").AsDuplicable(); AddComment(R"DOC( This operator decodes a JPEG image into a 3 dimensional RGB Tensor or 1 dimensional Gray Tensor. 
Optionally converts the image to the diff --git a/paddle/fluid/operators/data/batch_random_crop_and_resize_op.cc b/paddle/fluid/operators/data/batch_random_crop_and_resize_op.cc index b644541786083e..ee2c0596731da4 100644 --- a/paddle/fluid/operators/data/batch_random_crop_and_resize_op.cc +++ b/paddle/fluid/operators/data/batch_random_crop_and_resize_op.cc @@ -24,7 +24,10 @@ class BatchRandomCropAndResizeOp : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "BatchRandomCropAndResize"); + PADDLE_ENFORCE_GE(ctx->Inputs("X").size(), 1UL, + platform::errors::InvalidArgument( + "Inputs(X) of BatchRandomCropAndResize " + "should not be empty.")); OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "BatchRandomCropAndResize"); @@ -62,7 +65,8 @@ class BatchRandomCropAndResizeOp : public framework::OperatorWithKernel { class BatchRandomCropAndResizeOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddInput("X", "(LoDTensorArray). A batch of instances to random crop."); + AddInput("X", "(List(Tensor)). A batch of instances to random crop.") + .AsDuplicable(); AddOutput("Out", "(Tensor). 
The cropped instance batch."); AddAttr("aspect_ratio_min", "").SetDefault(3./4.); AddAttr("aspect_ratio_max", "").SetDefault(4./3.); diff --git a/paddle/fluid/operators/data/batch_random_crop_and_resize_op.cu b/paddle/fluid/operators/data/batch_random_crop_and_resize_op.cu index a663600dad5e04..a0378115f90716 100644 --- a/paddle/fluid/operators/data/batch_random_crop_and_resize_op.cu +++ b/paddle/fluid/operators/data/batch_random_crop_and_resize_op.cu @@ -276,8 +276,8 @@ class BatchRandomCropAndResizeCUDAKernel : public framework::OpKernel { platform::is_gpu_place(ctx.GetPlace()), true, platform::errors::NotFound("This kernel only runs on GPU device.")); // get input, output - auto* x = ctx.Input("X"); - PADDLE_ENFORCE_GT(x->size(), 0, + auto x = ctx.MultiInput("X"); + PADDLE_ENFORCE_GT(x.size(), 0, platform::errors::InvalidArgument( "The size of X must be greater than 0.")); auto* out = ctx.Output("Out"); @@ -291,7 +291,7 @@ class BatchRandomCropAndResizeCUDAKernel : public framework::OpKernel { AreaRange area_range{area_min, area_max}; auto* generators = GeneratorManager::Instance()->GetGenerators( - x->size(), x->size(), aspect_ratio_range, + x.size(), x.size(), aspect_ratio_range, area_range); const std::vector size = ctx.Attr>("size"); @@ -305,22 +305,22 @@ class BatchRandomCropAndResizeCUDAKernel : public framework::OpKernel { bool align_corners = ctx.Attr("align_corners"); int align_mode = ctx.Attr("align_mode"); - auto* img = &x->at(0); + auto* img = x.at(0); int64_t img_c = data_format == DataLayout::kNCHW ? 
\ img->dims()[0] : img->dims()[2]; std::vector out_dim; if (data_format == DataLayout::kNCHW) { - out_dim = {static_cast(x->size()), img_c, size[0], size[1]}; + out_dim = {static_cast(x.size()), img_c, size[0], size[1]}; } else { - out_dim = {static_cast(x->size()), size[0], size[1], img_c}; + out_dim = {static_cast(x.size()), size[0], size[1], img_c}; } out->Resize(phi::make_ddim(out_dim)); out->mutable_data(ctx.GetPlace()); int img_h, img_w, idx_h, idx_w, crop_h, crop_w; - for (int i = 0; i < x->size(); i++) { - img = &x->at(i); + for (int i = 0; i < x.size(); i++) { + img = x.at(i); img_h = data_format == DataLayout::kNCHW ? img->dims()[1] : img->dims()[0]; img_w = diff --git a/paddle/fluid/operators/data/random_crop_and_resize_op.cc b/paddle/fluid/operators/data/random_crop_and_resize_op.cc deleted file mode 100644 index f77be6f27bba62..00000000000000 --- a/paddle/fluid/operators/data/random_crop_and_resize_op.cc +++ /dev/null @@ -1,124 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/operators/data/random_crop_and_resize_op.h" - -namespace paddle { -namespace operators { -namespace data { - -class RandomCropAndResizeOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "RandomCropAndResize"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", - "RandomCropAndResize"); - - auto size = ctx->Attrs().Get>("size"); - PADDLE_ENFORCE_EQ(size.size(), 2, - platform::errors::InvalidArgument( - "The length of Attrs(size) should be 2.")); - PADDLE_ENFORCE_GT(size[0], 0, - platform::errors::InvalidArgument( - "h in Attr(size) of Op(RandomCropAndResize) " - "should be greater than 0.")); - PADDLE_ENFORCE_GT(size[1], 0, - platform::errors::InvalidArgument( - "w in Attr(size) of Op(RandomCropAndResize) " - "should be greater than 0.")); - } - - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::proto::VarType::UINT8, ctx.GetPlace()); - } - - framework::OpKernelType GetKernelTypeForVar( - const std::string& var_name, const framework::Tensor& tensor, - const framework::OpKernelType& expected_kernel_type) const override { - if (var_name == "X") { - return expected_kernel_type; - } - return framework::OpKernelType(expected_kernel_type.data_type_, - tensor.place(), tensor.layout()); - } -}; - -class RandomCropAndResizeOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(LoDTensorArray). A batch of instances to random crop."); - AddOutput("Out", "(Tensor). 
The cropped instance batch."); - AddAttr>( - "size", "expected output size of the crop, for each edge."); - AddAttr>( - "scale", - "Specifies the lower and upper bounds" - "for the random area of the crop, before resizing."); - AddAttr>( - "ratio", - "lower and upper bounds for the random aspect ratio of the crop, " - "before resizing."); - AddAttr("interp_method", - "(string, default \"bilinear\"), interpolation " - "method, can be \"bilinear\" for " - "bilinear interpolation and \"nearest\" for nearest " - "neighbor interpolation.") - .SetDefault("bilinear"); - AddAttr( - "align_corners", - "an optional bool. Defaults to True. " - "If True, the centers of 4 corner pixels of the input and output " - "tensors are aligned, preserving the values at the corner pixels, " - "If False, are not aligned") - .SetDefault(true); - AddAttr("align_mode", - "(int, default \'1\'), optional for bilinear interpolation, " - "can be \'0\' for src_idx = scale*(dst_indx+0.5)-0.5 , " - "can be \'1\' for src_idx = scale*dst_index .") - .SetDefault(1); - AddAttr( - "data_layout", - "(string, default NCHW) Only used in " - "an optional string from: \"NHWC\", \"NCHW\". " - "Specify that the data format of the input and output data is " - "channel_first or channel_last.") - .SetDefault("NCHW"); - AddAttr("seed", "The random seed. ").SetDefault(0); - AddComment(R"DOC( - Crop the input data to random size and aspect ratio. - A crop of random size (default: of 0.08 to 1.0) of the original size and a random - aspect ratio (default: of 3/4 to 1.33) of the original aspect ratio is made. - After applying crop transfrom, the input data will be resized to given size. 
- )DOC"); - } -}; - -} // namespace data -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR( - random_crop_and_resize, ops::data::RandomCropAndResizeOp, - ops::data::RandomCropAndResizeOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); - -REGISTER_OP_CPU_KERNEL(random_crop_and_resize, - ops::data::RandomCropAndResizeCPUKernel) diff --git a/python/paddle/tests/test_data_api_random_flip.py b/python/paddle/tests/test_data_apis.py similarity index 89% rename from python/paddle/tests/test_data_api_random_flip.py rename to python/paddle/tests/test_data_apis.py index 0a24bbe0625adf..e9ea5874f36cc7 100644 --- a/python/paddle/tests/test_data_api_random_flip.py +++ b/python/paddle/tests/test_data_apis.py @@ -32,6 +32,15 @@ def test_errors(self): except ValueError: pass + try: + data = paddle.ones([16, 3, 32, 32], dtype="float32") + out = random_flip(data, -0.5) + + # should not execute following lines + assert False + except ValueError: + pass + def test_output_dynamic(self): data = paddle.ones([16, 3, 32, 32], dtype="float32") out = random_flip(data, 0.5) diff --git a/python/paddle/tests/test_ops_crop_resize.py b/python/paddle/tests/test_ops_crop_resize.py index 551851f97ae1df..a4911f9006af8f 100644 --- a/python/paddle/tests/test_ops_crop_resize.py +++ b/python/paddle/tests/test_ops_crop_resize.py @@ -165,7 +165,7 @@ def np_image_resize(images, size, interp_method, class TestImageResizeNearestNCHW(unittest.TestCase): - def setup(self): + def setUp(self): self.image_shape1 = [3, 32, 32] self.image_shape2 = [3, 16, 16] self.size = (20, 30) @@ -174,28 +174,25 @@ def setup(self): self.align_corners = False self.align_mode = 1 - self._is_np_built = False self.build_np_data() def build_np_data(self): - if not self._is_np_built: - self.image1 = np.random.randint(0, 256, self.image_shape1, dtype="uint8") - self.image2 = np.random.randint(0, 256, self.image_shape2, dtype="uint8") - 
self.np_result = np_image_resize( - [self.image1, self.image2], - size=self.size, - interp_method=self.interp_method, - align_corners=self.align_corners, - align_mode=self.align_mode, - data_format=self.data_format) - self._is_np_built = True + self.image1 = np.random.randint(0, 256, self.image_shape1, dtype="uint8") + self.image2 = np.random.randint(0, 256, self.image_shape2, dtype="uint8") + self.np_result = np_image_resize( + [self.image1, self.image2], + size=self.size, + interp_method=self.interp_method, + align_corners=self.align_corners, + align_mode=self.align_mode, + data_format=self.data_format) def test_output_dynamic(self): + # NOTE: only support cuda kernel currently if not core.is_compiled_with_cuda(): return paddle.disable_static() - self.setup() images = paddle.tensor.create_array(dtype="uint8") images = paddle.tensor.array_write(paddle.to_tensor(self.image1), @@ -203,36 +200,27 @@ def test_output_dynamic(self): images = paddle.tensor.array_write(paddle.to_tensor(self.image2), paddle.to_tensor(1), images) - # NOTE: image_resize takes TensorArray as input, which cannot - # create by Python API in dynamic mode - try: - dy_result = image_resize(images, self.size, - interp_method=self.interp_method, - align_corners=self.align_corners, - align_mode=self.align_mode, - data_format=self.data_format) - except: - pass + result = image_resize(images, self.size, + interp_method=self.interp_method, + align_corners=self.align_corners, + align_mode=self.align_mode, + data_format=self.data_format) + assert np.allclose(result.numpy(), self.np_result, rtol=1) def test_output_static(self): + # NOTE: only support cuda kernel currently if not core.is_compiled_with_cuda(): return paddle.enable_static() - self.setup() - - images = paddle.tensor.create_array(dtype="uint8") - idx = fluid.layers.fill_constant(shape=[1], dtype="int64", value=0) image1 = fluid.layers.assign(self.image1.astype('int32')) image1 = fluid.layers.cast(image1, dtype='uint8') - images = 
paddle.tensor.array_write(image1, idx, images) image2 = fluid.layers.assign(self.image2.astype('int32')) image2 = fluid.layers.cast(image2, dtype='uint8') - images = paddle.tensor.array_write(image2, idx + 1, images) - out = image_resize(images, self.size, + out = image_resize([image1, image2], self.size, interp_method=self.interp_method, align_corners=self.align_corners, align_mode=self.align_mode, @@ -247,7 +235,7 @@ def test_output_static(self): class TestImageResizeNearestNHWC(TestImageResizeNearestNCHW): - def setup(self): + def setUp(self): self.image_shape1 = [32, 32, 3] self.image_shape2 = [16, 16, 3] self.size = 20 @@ -256,15 +244,11 @@ def setup(self): self.align_corners = True self.align_mode = 1 - self._is_np_built = False self.build_np_data() - def test_output_dynamic(self): - pass - class TestImageResizeNearestNCHWAlignCorner(TestImageResizeNearestNHWC): - def setup(self): + def setUp(self): self.image_shape1 = [3, 32, 32] self.image_shape2 = [3, 16, 16] self.size = 30 @@ -273,12 +257,11 @@ def setup(self): self.align_corners = True self.align_mode = 1 - self._is_np_built = False self.build_np_data() class TestImageResizeNearestNHWCAlignCorner(TestImageResizeNearestNHWC): - def setup(self): + def setUp(self): self.image_shape1 = [32, 32, 3] self.image_shape2 = [16, 16, 3] self.size = (20, 30) @@ -287,12 +270,11 @@ def setup(self): self.align_corners = True self.align_mode = 1 - self._is_np_built = False self.build_np_data() class TestImageResizeBilinearNCHW(TestImageResizeNearestNHWC): - def setup(self): + def setUp(self): self.image_shape1 = [3, 32, 32] self.image_shape2 = [3, 16, 16] self.size = (20, 30) @@ -301,12 +283,11 @@ def setup(self): self.align_corners = False self.align_mode = 1 - self._is_np_built = False self.build_np_data() class TestImageResizeBilinearNHWC(TestImageResizeNearestNHWC): - def setup(self): + def setUp(self): self.image_shape1 = [32, 32, 3] self.image_shape2 = [16, 16, 3] self.size = (20, 30) @@ -315,12 +296,11 @@ def 
setup(self): self.align_corners = False self.align_mode = 1 - self._is_np_built = False self.build_np_data() class TestImageResizeBilinearNCHWAlignMode0(TestImageResizeNearestNHWC): - def setup(self): + def setUp(self): self.image_shape1 = [3, 32, 32] self.image_shape2 = [3, 16, 16] self.size = (20, 30) @@ -329,12 +309,11 @@ def setup(self): self.align_corners = False self.align_mode = 0 - self._is_np_built = False self.build_np_data() class TestImageResizeBilinearNHWCAlignMode0(TestImageResizeNearestNHWC): - def setup(self): + def setUp(self): self.image_shape1 = [32, 32, 3] self.image_shape2 = [16, 16, 3] self.size = (20, 30) @@ -343,12 +322,11 @@ def setup(self): self.align_corners = False self.align_mode = 0 - self._is_np_built = False self.build_np_data() class TestImageResizeBilinearNCHWAlignCorner(TestImageResizeNearestNHWC): - def setup(self): + def setUp(self): self.image_shape1 = [3, 32, 32] self.image_shape2 = [3, 16, 16] self.size = (20, 30) @@ -357,12 +335,11 @@ def setup(self): self.align_corners = True self.align_mode = 1 - self._is_np_built = False self.build_np_data() class TestImageResizeBilinearNHWCAlignCorner(TestImageResizeNearestNHWC): - def setup(self): + def setUp(self): self.image_shape1 = [32, 32, 3] self.image_shape2 = [16, 16, 3] self.size = (20, 30) @@ -371,12 +348,11 @@ def setup(self): self.align_corners = True self.align_mode = 1 - self._is_np_built = False self.build_np_data() class TestImageCropResizeNearestNCHW(unittest.TestCase): - def setup(self): + def setUp(self): self.image_shape1 = [3, 16, 16] self.image_shape2 = [3, 32, 32] self.size = (20, 30) @@ -387,21 +363,18 @@ def setup(self): self.out_shape = (2, 3, 20, 30) - self._is_np_built = False self.build_np_data() def build_np_data(self): - if not self._is_np_built: - self.image1 = np.random.randint(0, 256, self.image_shape1, dtype="uint8") - self.image2 = np.random.randint(0, 256, self.image_shape2, dtype="uint8") - self._is_np_built = True + self.image1 = 
np.random.randint(0, 256, self.image_shape1, dtype="uint8") + self.image2 = np.random.randint(0, 256, self.image_shape2, dtype="uint8") def test_output_dynamic(self): + # NOTE: only support cuda kernel currently if not core.is_compiled_with_cuda(): return paddle.disable_static() - self.setup() images = paddle.tensor.create_array(dtype="uint8") images = paddle.tensor.array_write(paddle.to_tensor(self.image1), @@ -409,38 +382,31 @@ def test_output_dynamic(self): images = paddle.tensor.array_write(paddle.to_tensor(self.image2), paddle.to_tensor(1), images) - # NOTE: image_resize takes TensorArray as input, which cannot - # create by Python API in dynamic mode - try: - dy_result = random_crop_and_resize( - images, self.size, - interp_method=self.interp_method, - align_corners=self.align_corners, - align_mode=self.align_mode, - data_format=self.data_format) - except: - pass + result = random_crop_and_resize( + images, self.size, + interp_method=self.interp_method, + align_corners=self.align_corners, + align_mode=self.align_mode, + data_format=self.data_format) + result = result.numpy() + assert result.shape == self.out_shape + assert result.dtype == np.uint8 def test_output_static(self): + # NOTE: only support cuda kernel currently if not core.is_compiled_with_cuda(): return paddle.enable_static() - self.setup() - images = paddle.tensor.create_array(dtype="uint8") - - idx = fluid.layers.fill_constant(shape=[1], dtype="int64", value=0) image1 = fluid.layers.assign(self.image1.astype('int32')) image1 = fluid.layers.cast(image1, dtype='uint8') - images = paddle.tensor.array_write(image1, idx, images) image2 = fluid.layers.assign(self.image2.astype('int32')) image2 = fluid.layers.cast(image2, dtype='uint8') - images = paddle.tensor.array_write(image2, idx + 1, images) out = random_crop_and_resize( - images, self.size, + [image1, image2], self.size, interp_method=self.interp_method, align_corners=self.align_corners, align_mode=self.align_mode, @@ -450,12 +416,13 @@ def 
test_output_static(self): result, = exe.run(paddle.static.default_main_program(), fetch_list=[out]) assert result.shape == self.out_shape + assert result.dtype == np.uint8 paddle.disable_static() class TestImageCropResizeNearestNHWC(TestImageCropResizeNearestNCHW): - def setup(self): + def setUp(self): self.image_shape1 = [16, 16, 3] self.image_shape2 = [32, 32, 3] self.size = 20 @@ -466,12 +433,11 @@ def setup(self): self.out_shape = (2, 20, 20, 3) - self._is_np_built = False self.build_np_data() class TestImageCropResizeNearestNCHWAlignCorner(TestImageCropResizeNearestNCHW): - def setup(self): + def setUp(self): self.image_shape1 = [3, 16, 16] self.image_shape2 = [3, 32, 32] self.size = 20 @@ -482,12 +448,11 @@ def setup(self): self.out_shape = (2, 3, 20, 20) - self._is_np_built = False self.build_np_data() class TestImageCropResizeNearestNHWCAlignCorner(TestImageCropResizeNearestNCHW): - def setup(self): + def setUp(self): self.image_shape1 = [16, 16, 3] self.image_shape2 = [32, 32, 3] self.size = (20, 30) @@ -498,12 +463,11 @@ def setup(self): self.out_shape = (2, 20, 30, 3) - self._is_np_built = False self.build_np_data() class TestImageCropResizeBilinearNCHW(TestImageCropResizeNearestNCHW): - def setup(self): + def setUp(self): self.image_shape1 = [3, 16, 16] self.image_shape2 = [3, 32, 32] self.size = (20, 30) @@ -519,7 +483,7 @@ def setup(self): class TestImageCropResizeNearestNHWC(TestImageCropResizeNearestNCHW): - def setup(self): + def setUp(self): self.image_shape1 = [16, 16, 3] self.image_shape2 = [32, 32, 3] self.size = (20, 30) @@ -530,12 +494,11 @@ def setup(self): self.out_shape = (2, 20, 30, 3) - self._is_np_built = False self.build_np_data() class TestImageCropResizeBilinearNCHWAlignMode0(TestImageCropResizeNearestNCHW): - def setup(self): + def setUp(self): self.image_shape1 = [3, 16, 16] self.image_shape2 = [3, 32, 32] self.size = (20, 30) @@ -546,12 +509,11 @@ def setup(self): self.out_shape = (2, 3, 20, 30) - self._is_np_built = False 
self.build_np_data() class TestImageCropResizeNearestNHWCAlignMode0(TestImageCropResizeNearestNCHW): - def setup(self): + def setUp(self): self.image_shape1 = [16, 16, 3] self.image_shape2 = [32, 32, 3] self.size = (20, 30) @@ -562,12 +524,11 @@ def setup(self): self.out_shape = (2, 20, 30, 3) - self._is_np_built = False self.build_np_data() class TestImageCropResizeBilinearNCHWAlignCorner(TestImageCropResizeNearestNCHW): - def setup(self): + def setUp(self): self.image_shape1 = [3, 16, 16] self.image_shape2 = [3, 32, 32] self.size = (20, 30) @@ -578,12 +539,11 @@ def setup(self): self.out_shape = (2, 3, 20, 30) - self._is_np_built = False self.build_np_data() class TestImageCropResizeNearestNHWCAlignCorner(TestImageCropResizeNearestNCHW): - def setup(self): + def setUp(self): self.image_shape1 = [16, 16, 3] self.image_shape2 = [32, 32, 3] self.size = (20, 30) @@ -594,7 +554,6 @@ def setup(self): self.out_shape = (2, 20, 30, 3) - self._is_np_built = False self.build_np_data() diff --git a/python/paddle/tests/test_ops_decode.py b/python/paddle/tests/test_ops_decode.py index 95128be5f240cc..18879f75aaa5c3 100644 --- a/python/paddle/tests/test_ops_decode.py +++ b/python/paddle/tests/test_ops_decode.py @@ -32,7 +32,7 @@ class TestImageReaderDecodeCase1(unittest.TestCase): - def setup(self): + def setUp(self): self.data_root = get_path_from_url(DATASET_URL, DATASET_HOME, DATASET_MD5) @@ -42,16 +42,21 @@ def setup(self): self.device_memory_padding = 1000000 def test_static_output(self): + # NOTE: only support cuda kernel currently + if not core.is_compiled_with_cuda(): + return + paddle.enable_static() - self.setup() indices = paddle.arange(self.batch_size) - image, label = file_label_loader(self.data_root, indices) - image = image_decode(image, - num_threads=self.num_threads) + image, label = file_label_loader(self.data_root, indices, + self.batch_size) + image = image_decode(image, num_threads=self.num_threads) exe = paddle.static.Executor(paddle.CUDAPlace(0)) - 
out_image, out_label = exe.run(paddle.static.default_main_program(), - fetch_list=[image, label]) + rets = exe.run(paddle.static.default_main_program(), + fetch_list=image + [label]) + out_image = rets[:-1] + out_label = rets[-1] assert len(out_image) == self.batch_size for i in range(self.batch_size): @@ -61,15 +66,41 @@ def test_static_output(self): assert np.all(img >= 0) assert np.all(img <= 255) - assert len(out_label) == self.batch_size - assert label.dtype == paddle.int64 label = np.array(out_label) + assert label.dtype == np.int64 + assert label.shape[0] == self.batch_size + assert np.all(label >= 0) + assert np.all(label <= 1) + + paddle.disable_static() + + def test_dynamic_output(self): + # NOTE: only support cuda kernel currently + if not core.is_compiled_with_cuda(): + return + + indices = paddle.arange(self.batch_size) + image, label = file_label_loader(self.data_root, indices, + self.batch_size) + image = image_decode(image, num_threads=self.num_threads) + + assert len(image) == self.batch_size + for i in range(self.batch_size): + img = image[i].numpy() + assert img.dtype == np.uint8 + assert img.shape[2] == 3 + assert np.all(img >= 0) + assert np.all(img <= 255) + + label = label.numpy() + assert label.dtype == np.int64 + assert label.shape[0] == self.batch_size assert np.all(label >= 0) assert np.all(label <= 1) class TestImageReaderDecodeCase2(TestImageReaderDecodeCase1): - def setup(self): + def setUp(self): self.data_root = get_path_from_url(DATASET_URL, DATASET_HOME, DATASET_MD5) @@ -80,7 +111,7 @@ def setup(self): class TestImageReaderDecodeRandomCropNCHW(unittest.TestCase): - def setup(self): + def setUp(self): self.data_root = get_path_from_url(DATASET_URL, DATASET_HOME, DATASET_MD5) @@ -99,11 +130,15 @@ def setup(self): self.channel_dim = 0 def test_static_output(self): + # NOTE: only support cuda kernel currently + if not core.is_compiled_with_cuda(): + return + paddle.enable_static() - self.setup() indices = paddle.arange(self.batch_size) 
- image, label = file_label_loader(self.data_root, indices) + image, label = file_label_loader(self.data_root, indices, + self.batch_size) image = image_decode_random_crop(image, num_threads=self.num_threads, aspect_ratio_min=self.aspect_ratio_min, @@ -113,8 +148,10 @@ def test_static_output(self): num_attempts=self.num_attempts, data_format=self.data_format) exe = paddle.static.Executor(paddle.CUDAPlace(0)) - out_image, out_label = exe.run(paddle.static.default_main_program(), - fetch_list=[image, label]) + rets = exe.run(paddle.static.default_main_program(), + fetch_list=image + [label]) + out_image = rets[:-1] + out_label = rets[-1] assert len(out_image) == self.batch_size for i in range(self.batch_size): @@ -130,9 +167,42 @@ def test_static_output(self): assert np.all(label >= 0) assert np.all(label <= 1) + paddle.disable_static() + + def test_dynamic_output(self): + # NOTE: only support cuda kernel currently + if not core.is_compiled_with_cuda(): + return + + indices = paddle.arange(self.batch_size) + image, label = file_label_loader(self.data_root, indices, + self.batch_size) + image = image_decode_random_crop(image, + num_threads=self.num_threads, + aspect_ratio_min=self.aspect_ratio_min, + aspect_ratio_max=self.aspect_ratio_max, + area_min=self.area_min, + area_max=self.area_max, + num_attempts=self.num_attempts, + data_format=self.data_format) + + assert len(image) == self.batch_size + for i in range(self.batch_size): + img = image[i].numpy() + assert img.dtype == np.uint8 + assert img.shape[self.channel_dim] == 3 + assert np.all(img >= 0) + assert np.all(img <= 255) + + label = label.numpy() + assert label.shape[0] == self.batch_size + assert label.dtype == np.int64 + assert np.all(label >= 0) + assert np.all(label <= 1) + class TestImageReaderDecodeRandomCropNHWC(TestImageReaderDecodeRandomCropNCHW): - def setup(self): + def setUp(self): self.data_root = get_path_from_url(DATASET_URL, DATASET_HOME, DATASET_MD5) @@ -151,5 +221,25 @@ def setup(self): 
self.channel_dim = 2 +class TestImageReaderDecodeRandomCropThread8(TestImageReaderDecodeRandomCropNCHW): + def setUp(self): + self.data_root = get_path_from_url(DATASET_URL, DATASET_HOME, + DATASET_MD5) + + self.batch_size = 16 + self.num_threads = 8 + self.host_memory_padding = 20000 + self.device_memory_padding = 20000 + + self.aspect_ratio_min = 1. / 2. + self.aspect_ratio_max = 3. / 2. + self.area_min = 0.01 + self.area_max = 0.99 + self.num_attempts = 50 + + self.data_format = "NCHW" + self.channel_dim = 0 + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/tests/test_ops_file_label_loader.py b/python/paddle/tests/test_ops_file_label_loader.py index 4f6a3a9633796b..91f34dcfdb7677 100644 --- a/python/paddle/tests/test_ops_file_label_loader.py +++ b/python/paddle/tests/test_ops_file_label_loader.py @@ -47,14 +47,14 @@ def build_program(self): paddle.enable_static() self.indices_data = paddle.static.data( shape=[self.batch_size], dtype='int64', name='indices') - self.sample_data, self.label_data = file_label_loader(self.data_root, self.indices_data) + self.sample_data, self.label_data = file_label_loader(self.data_root, self.indices_data, self.batch_size) self.exe = paddle.static.Executor(paddle.CPUPlace()) paddle.disable_static() def loader_function(self, indices): if paddle.in_dynamic_mode(): indices = paddle.to_tensor(indices) - return file_label_loader(self.data_root, indices) + return file_label_loader(self.data_root, indices, self.batch_size) else: paddle.enable_static() return self.exe.run(paddle.static.default_main_program(), diff --git a/python/paddle/tests/test_ops_mirror_normalize.py b/python/paddle/tests/test_ops_mirror_normalize.py index ec172355cb8256..0c3270aea09148 100644 --- a/python/paddle/tests/test_ops_mirror_normalize.py +++ b/python/paddle/tests/test_ops_mirror_normalize.py @@ -45,7 +45,7 @@ def np_mirror_normalize(image, mirror, mean, std): class TestMirrorNormalize(unittest.TestCase): - def setup(self): + def 
setUp(self): self.image_shape = [16, 3, 32, 32] self.mirror_shape = [16, 1] self.mean = [123.675, 116.28, 103.53] @@ -62,14 +62,16 @@ def test_check_output_dynamic(self): if not core.is_compiled_with_cuda(): return - self.setup() dy_result = mirror_normalize(paddle.to_tensor(self.image), paddle.to_tensor(self.mirror), self.mean, self.std) assert np.allclose(self.result, dy_result.numpy()) def test_check_output_static(self): - self.setup() + # NOTE: only supoort CUDA kernel currently + if not core.is_compiled_with_cuda(): + return + paddle.enable_static() image_data = paddle.static.data(shape=self.image_shape, @@ -99,7 +101,7 @@ def test_check_output_static(self): class TestMirrorNormalizeSingleMeanStd(TestMirrorNormalize): - def setup(self): + def setUp(self): self.image_shape = [16, 3, 32, 32] self.mirror_shape = [16, 1] self.mean = [123.675] @@ -113,7 +115,7 @@ def setup(self): class TestMirrorNormalizeFloatMeanStd(TestMirrorNormalize): - def setup(self): + def setUp(self): self.image_shape = [16, 3, 32, 32] self.mirror_shape = [16, 1] self.mean = 123.675 diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index 9ee40065763b9c..daaa855329bcf3 100644 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -934,10 +934,12 @@ def image_decode(x, } helper = LayerHelper("batch_decode", **locals()) - out = helper.create_variable( - name=unique_name.generate("image_decode"), - type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, - dtype=x.dtype) + out = [ + helper.create_variable( + name=unique_name.generate("image_decode"), + type=core.VarDesc.VarType.LOD_TENSOR, + dtype='uint8') for i in range(len(x)) + ] helper.append_op( type="batch_decode", inputs=inputs, attrs=attrs, outputs={"Out": out}) @@ -1019,14 +1021,10 @@ def image_decode_random_crop(x, "program_id": utils._hash_with_id(default_main_program()) } - helper = LayerHelper("batch_decode_random_crop", **locals()) - # out = helper.create_variable( - # 
name=unique_name.generate("image_decode_random_crop"), - # type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, - # dtype=x.dtype) + helper = LayerHelper("image_decode_random_crop", **locals()) out = [ helper.create_variable( - name=unique_name.generate("file_label_loader"), + name=unique_name.generate("image_decode_random_crop"), type=core.VarDesc.VarType.LOD_TENSOR, dtype='uint8') for i in range(len(x)) ] From 4c91cd90b5f08b33635eadccaa20c9b269c1747e Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Sun, 3 Apr 2022 14:06:27 +0000 Subject: [PATCH 89/95] fix ci compile. test=develop --- .../framework/ir/dataloader_queue_pass.cc | 34 +-- paddle/fluid/framework/operator.cc | 7 +- paddle/fluid/operators/CMakeLists.txt | 2 +- paddle/fluid/operators/data/CMakeLists.txt | 5 +- .../fluid/operators/data/batch_decode_op.cc | 6 +- .../fluid/operators/data/batch_decode_op.cu | 11 +- paddle/fluid/operators/data/batch_decode_op.h | 5 +- .../data/batch_decode_random_crop_op.cu | 22 +- .../data/batch_decode_random_crop_op.h | 4 +- .../data/batch_random_crop_and_resize_op.cc | 15 +- .../data/batch_random_crop_and_resize_op.cu | 18 +- .../data/batch_random_crop_and_resize_op.h | 1 - paddle/fluid/operators/data/batch_resize_op.h | 3 +- paddle/fluid/operators/data/data_reader_op.cc | 38 ++- paddle/fluid/operators/data/data_reader_op.h | 255 +++++++++--------- paddle/fluid/operators/data/dataloader_op.h | 3 +- .../operators/data/file_label_loader_op.cc | 3 - .../operators/data/file_label_loader_op.h | 1 - paddle/fluid/operators/data/image_decoder.cc | 1 - paddle/fluid/operators/data/image_decoder.h | 9 +- paddle/fluid/operators/data/map_op.cc | 27 +- paddle/fluid/operators/data/map_op.cu.cc | 3 +- paddle/fluid/operators/data/map_op.h | 30 +-- paddle/fluid/operators/data/map_runner.cc | 96 +++---- paddle/fluid/operators/data/map_runner.h | 50 ++-- .../operators/data/mirror_normalize_op.cc | 59 ++-- .../operators/data/mirror_normalize_op.cu | 20 +- paddle/fluid/operators/data/pipeline.cc | 20 +- 
paddle/fluid/operators/data/pipeline.h | 35 +-- .../operators/data/random_roi_generator.cc | 31 +-- .../operators/data/random_roi_generator.h | 97 ++++--- paddle/fluid/operators/data/utils.h | 20 +- paddle/fluid/operators/split_lod_tensor_op.cc | 42 ++- paddle/fluid/platform/device_context.cc | 55 +++- paddle/fluid/platform/device_context.h | 9 +- paddle/fluid/platform/dynload/nvjpeg.h | 56 ++-- paddle/fluid/platform/enforce.h | 62 +++-- paddle/fluid/pybind/protobuf.cc | 3 +- paddle/fluid/pybind/pybind.cc | 10 +- paddle/phi/backends/dynload/nvjpeg.h | 56 ++-- python/paddle/fluid/core.py | 2 +- python/paddle/fluid/dataloader/ops.py | 54 ++-- python/paddle/fluid/dataloader/pipeline.py | 12 +- python/paddle/fluid/reader.py | 2 + python/paddle/tests/CMakeLists.txt | 8 + python/paddle/tests/test_data_apis.py | 11 +- python/paddle/tests/test_data_pipeline.py | 36 ++- python/paddle/tests/test_ops_crop_resize.py | 129 +++++---- python/paddle/tests/test_ops_decode.py | 40 +-- .../tests/test_ops_file_label_loader.py | 33 ++- .../paddle/tests/test_ops_mirror_normalize.py | 63 +++-- 51 files changed, 832 insertions(+), 782 deletions(-) diff --git a/paddle/fluid/framework/ir/dataloader_queue_pass.cc b/paddle/fluid/framework/ir/dataloader_queue_pass.cc index dc9e7ac024cfa2..8f3a902815da7a 100644 --- a/paddle/fluid/framework/ir/dataloader_queue_pass.cc +++ b/paddle/fluid/framework/ir/dataloader_queue_pass.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include #include +#include #include "glog/logging.h" #include "paddle/fluid/framework/ir/pass.h" @@ -25,14 +25,11 @@ namespace ir { class Graph; std::set output_queue_holder_ops = { - "file_label_reader", - "map", - "data_reader", + "file_label_reader", "map", "data_reader", }; std::set input_array_ops = { - "random_crop_and_resize", - "batch_decode", + "random_crop_and_resize", "batch_decode", }; static bool IsOutputQueueHolderOp(std::string op_type) { @@ -43,15 +40,15 @@ static bool IsInputArrayOp(std::string op_type) { return input_array_ops.find(op_type) != input_array_ops.end(); } -static void ProcessOutputQueueHolderOp(ir::Graph* graph) { +static void ProcessOutputQueueHolderOp(ir::Graph *graph) { std::set var_names; for (const Node *n : graph->Nodes()) { if (n->IsOp() && n->Op()) { auto *op = n->Op(); if (IsOutputQueueHolderOp(op->Type())) { - auto& outputs = op->Outputs(); + auto &outputs = op->Outputs(); for (auto iter = outputs.begin(); iter != outputs.end(); iter++) { - for (auto var: iter->second) var_names.insert(var); + for (auto var : iter->second) var_names.insert(var); } } } @@ -61,7 +58,8 @@ static void ProcessOutputQueueHolderOp(ir::Graph* graph) { if (n->IsVar() && n->Var()) { auto *var = n->Var(); if (var_names.find(var->Name()) != var_names.end()) { - VLOG(3) << "Change output variable type of " << var->Name() << " to queue holder"; + VLOG(3) << "Change output variable type of " << var->Name() + << " to queue holder"; var->SetType(framework::proto::VarType::LOD_TENSOR_BLOCKING_QUEUE); var->SetPersistable(true); } @@ -69,15 +67,15 @@ static void ProcessOutputQueueHolderOp(ir::Graph* graph) { } } -static void ProcessInputArrayOp(ir::Graph* graph) { +static void ProcessInputArrayOp(ir::Graph *graph) { std::set var_names; for (const Node *n : graph->Nodes()) { if (n->IsOp() && n->Op()) { auto *op = n->Op(); if (IsInputArrayOp(op->Type())) { - auto& inputs = op->Inputs(); + auto &inputs = op->Inputs(); for (auto iter = inputs.begin(); iter != 
inputs.end(); iter++) { - for (auto var: iter->second) var_names.insert(var); + for (auto var : iter->second) var_names.insert(var); } } } @@ -87,16 +85,17 @@ static void ProcessInputArrayOp(ir::Graph* graph) { if (n->IsVar() && n->Var()) { auto *var = n->Var(); if (var_names.find(var->Name()) != var_names.end()) { - VLOG(3) << "Change output variable type of " << var->Name() << " to queue holder"; + VLOG(3) << "Change output variable type of " << var->Name() + << " to queue holder"; var->SetType(framework::proto::VarType::LOD_TENSOR_ARRAY); } } } } -class DataLoaderQueuePass: public Pass { +class DataLoaderQueuePass : public Pass { protected: - void ApplyImpl(ir::Graph* graph) const override { + void ApplyImpl(ir::Graph *graph) const override { ProcessOutputQueueHolderOp(graph); ProcessInputArrayOp(graph); } @@ -106,4 +105,5 @@ class DataLoaderQueuePass: public Pass { } // namespace framework } // namespace paddle -REGISTER_PASS(dataloader_queue_pass, paddle::framework::ir::DataLoaderQueuePass); +REGISTER_PASS(dataloader_queue_pass, + paddle::framework::ir::DataLoaderQueuePass); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 6e325f652ed880..622628b4354ee9 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1242,9 +1242,10 @@ void OperatorWithKernel::RunImpl(const Scope& scope, const platform::Place& place, RuntimeContext* runtime_ctx) const { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto* dev_ctx = HasAttr("_stream_id") ? - platform::AsyncDeviceContextPool::Instance().Get( - place, Attr("_stream_id")) : nullptr; + auto* dev_ctx = HasAttr("_stream_id") + ? 
platform::AsyncDeviceContextPool::Instance().Get( + place, Attr("_stream_id")) + : nullptr; if (dev_ctx == nullptr) { dev_ctx = pool.Get(place); } diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 49f1a8ad8fbc94..aee9a794c15ccb 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -22,7 +22,6 @@ add_subdirectory(reduce_ops) add_subdirectory(sequence_ops) add_subdirectory(string) add_subdirectory(jit) -add_subdirectory(data) if(WITH_MKLDNN) add_subdirectory(mkldnn) endif() @@ -42,6 +41,7 @@ add_subdirectory(reader) if (NOT WIN32) add_subdirectory(nccl) + add_subdirectory(data) endif() if (WITH_GPU AND TENSORRT_FOUND) diff --git a/paddle/fluid/operators/data/CMakeLists.txt b/paddle/fluid/operators/data/CMakeLists.txt index 925a1b9a8cdd81..d0ee1bff9e3f2a 100644 --- a/paddle/fluid/operators/data/CMakeLists.txt +++ b/paddle/fluid/operators/data/CMakeLists.txt @@ -13,10 +13,9 @@ op_library(data_reader_op SRCS data_reader_op.cc DEPS ${OP_HEADER_DEPS}) cc_library(map_runner SRCS map_runner.cc DEPS parallel_executor simple_threadpool scope) op_library(map_op SRCS map_op.cc map_op.cu.cc DEPS map_runner ${OP_HEADER_DEPS}) -op_library(file_label_loader_op SRCS file_label_loader_op.cc DEPS ${OP_HEADER_DEPS}) +if (WITH_GPU AND NOT WIN32) + op_library(file_label_loader_op SRCS file_label_loader_op.cc DEPS ${OP_HEADER_DEPS}) - -if (WITH_GPU) cc_library(random_roi_generator SRCS random_roi_generator.cc DEPS ${OP_HEADER_DEPS}) cc_library(image_decoder SRCS image_decoder.cc DEPS random_roi_generator ${OP_HEADER_DEPS}) diff --git a/paddle/fluid/operators/data/batch_decode_op.cc b/paddle/fluid/operators/data/batch_decode_op.cc index e2f3675bc6b34c..04a366dc6f3c23 100644 --- a/paddle/fluid/operators/data/batch_decode_op.cc +++ b/paddle/fluid/operators/data/batch_decode_op.cc @@ -56,7 +56,8 @@ class BatchDecodeOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "(List[Tensor]) A one 
dimensional uint8 tensor containing " "the raw bytes of the JPEG image. It is a tensor with rank " - "1.").AsDuplicable(); + "1.") + .AsDuplicable(); AddOutput("Out", "The output tensor of BatchDecodeOp").AsDuplicable(); AddComment(R"DOC( This operator decodes a JPEG image into a 3 dimensional RGB Tensor @@ -64,8 +65,7 @@ or 1 dimensional Gray Tensor. Optionally converts the image to the desired format. The values of the output tensor are uint8 between 0 and 255. )DOC"); - AddAttr("num_threads", "Path of the file to be readed.") - .SetDefault(2); + AddAttr("num_threads", "Path of the file to be readed.").SetDefault(2); AddAttr("local_rank", "(int)" "The index of the op to start execution"); diff --git a/paddle/fluid/operators/data/batch_decode_op.cu b/paddle/fluid/operators/data/batch_decode_op.cu index b587fc300277b6..aa5596527e40ec 100644 --- a/paddle/fluid/operators/data/batch_decode_op.cu +++ b/paddle/fluid/operators/data/batch_decode_op.cu @@ -53,11 +53,12 @@ class GPUBatchDecodeKernel : public framework::OpKernel { auto* x_data = x->data(); size_t x_numel = static_cast(x->numel()); - ImageDecodeTask task = {.bit_stream = x_data, - .bit_len = x_numel, - .tensor = out_array[i], - .roi_generator = nullptr, - .place = dev}; + ImageDecodeTask task; + task.bit_stream = x_data; + task.bit_len = x_numel; + task.tensor = out_array[i]; + task.roi_generator = nullptr; + task.place = dev; decode_pool->AddTask(std::make_shared(task)); } diff --git a/paddle/fluid/operators/data/batch_decode_op.h b/paddle/fluid/operators/data/batch_decode_op.h index e564a6ae942632..cb0b4382346adf 100644 --- a/paddle/fluid/operators/data/batch_decode_op.h +++ b/paddle/fluid/operators/data/batch_decode_op.h @@ -18,13 +18,12 @@ #include #include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/var_type.h" +#include 
"paddle/fluid/operators/data/image_decoder.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/fluid/operators/data/image_decoder.h" - namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/data/batch_decode_random_crop_op.cu b/paddle/fluid/operators/data/batch_decode_random_crop_op.cu index 9e882ac8eac88c..dc06f0db496e06 100644 --- a/paddle/fluid/operators/data/batch_decode_random_crop_op.cu +++ b/paddle/fluid/operators/data/batch_decode_random_crop_op.cu @@ -79,21 +79,13 @@ class GPUBatchDecodeRandomCropKernel : public framework::OpKernel { auto* x_data = x->data(); size_t x_numel = static_cast(x->numel()); - if (data_format == DataLayout::kNCHW) { - ImageDecodeTask task = {.bit_stream = x_data, - .bit_len = x_numel, - .tensor = &temp_array[i], - .roi_generator = generators->at(i).get(), - .place = dev}; - decode_pool->AddTask(std::make_shared(task)); - } else { - ImageDecodeTask task = {.bit_stream = x_data, - .bit_len = x_numel, - .tensor = out_array[i], - .roi_generator = generators->at(i).get(), - .place = dev}; - decode_pool->AddTask(std::make_shared(task)); - } + ImageDecodeTask task; + task.bit_stream = x_data; + task.bit_len = x_numel; + task.roi_generator = generators->at(i).get(), task.place = dev; + task.tensor = + data_format == DataLayout::kNCHW ? 
&temp_array[i] : out_array[i]; + decode_pool->AddTask(std::make_shared(task)); } decode_pool->RunAll(true); diff --git a/paddle/fluid/operators/data/batch_decode_random_crop_op.h b/paddle/fluid/operators/data/batch_decode_random_crop_op.h index 377dab7d4277c2..cda2c39ff89df9 100644 --- a/paddle/fluid/operators/data/batch_decode_random_crop_op.h +++ b/paddle/fluid/operators/data/batch_decode_random_crop_op.h @@ -18,12 +18,12 @@ #include #include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/operators/data/image_decoder.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/fluid/operators/data/image_decoder.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/data/batch_random_crop_and_resize_op.cc b/paddle/fluid/operators/data/batch_random_crop_and_resize_op.cc index ee2c0596731da4..c47c0a246d1b68 100644 --- a/paddle/fluid/operators/data/batch_random_crop_and_resize_op.cc +++ b/paddle/fluid/operators/data/batch_random_crop_and_resize_op.cc @@ -26,8 +26,8 @@ class BatchRandomCropAndResizeOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE_GE(ctx->Inputs("X").size(), 1UL, platform::errors::InvalidArgument( - "Inputs(X) of BatchRandomCropAndResize " - "should not be empty.")); + "Inputs(X) of BatchRandomCropAndResize " + "should not be empty.")); OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "BatchRandomCropAndResize"); @@ -47,8 +47,8 @@ class BatchRandomCropAndResizeOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::proto::VarType::UINT8, ctx.GetPlace()); + 
return framework::OpKernelType(framework::proto::VarType::UINT8, + ctx.GetPlace()); } framework::OpKernelType GetKernelTypeForVar( @@ -62,14 +62,15 @@ class BatchRandomCropAndResizeOp : public framework::OperatorWithKernel { } }; -class BatchRandomCropAndResizeOpMaker : public framework::OpProtoAndCheckerMaker { +class BatchRandomCropAndResizeOpMaker + : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", "(List(Tensor)). A batch of instances to random crop.") .AsDuplicable(); AddOutput("Out", "(Tensor). The cropped instance batch."); - AddAttr("aspect_ratio_min", "").SetDefault(3./4.); - AddAttr("aspect_ratio_max", "").SetDefault(4./3.); + AddAttr("aspect_ratio_min", "").SetDefault(3. / 4.); + AddAttr("aspect_ratio_max", "").SetDefault(4. / 3.); AddAttr("area_min", "").SetDefault(0.08); AddAttr("area_max", "").SetDefault(1.); AddAttr("num_attempts", "").SetDefault(10); diff --git a/paddle/fluid/operators/data/batch_random_crop_and_resize_op.cu b/paddle/fluid/operators/data/batch_random_crop_and_resize_op.cu index a0378115f90716..2b04e908fe4512 100644 --- a/paddle/fluid/operators/data/batch_random_crop_and_resize_op.cu +++ b/paddle/fluid/operators/data/batch_random_crop_and_resize_op.cu @@ -23,7 +23,8 @@ namespace data { using framework::LoDTensor; using DataLayout = framework::DataLayout; -using LoDTensorBlockingQueueHolder = operators::reader::LoDTensorBlockingQueueHolder; +using LoDTensorBlockingQueueHolder = + operators::reader::LoDTensorBlockingQueueHolder; template __global__ void KeNearestNeighborInterpFw( @@ -291,8 +292,7 @@ class BatchRandomCropAndResizeCUDAKernel : public framework::OpKernel { AreaRange area_range{area_min, area_max}; auto* generators = GeneratorManager::Instance()->GetGenerators( - x.size(), x.size(), aspect_ratio_range, - area_range); + x.size(), x.size(), aspect_ratio_range, area_range); const std::vector size = ctx.Attr>("size"); @@ -306,8 +306,8 @@ class BatchRandomCropAndResizeCUDAKernel : public 
framework::OpKernel { int align_mode = ctx.Attr("align_mode"); auto* img = x.at(0); - int64_t img_c = data_format == DataLayout::kNCHW ? \ - img->dims()[0] : img->dims()[2]; + int64_t img_c = + data_format == DataLayout::kNCHW ? img->dims()[0] : img->dims()[2]; std::vector out_dim; if (data_format == DataLayout::kNCHW) { @@ -331,10 +331,10 @@ class BatchRandomCropAndResizeCUDAKernel : public framework::OpKernel { // &crop_w, seed); auto out_tensor = out->Slice(i, i + 1); - BatchRandomCropAndResizeFwd( - ctx, *img, &out_tensor, size, interp_method, align_corners, - align_mode, img_h, img_w, img_c, roi.y, roi.x, roi.h, - roi.w, data_format); + BatchRandomCropAndResizeFwd(ctx, *img, &out_tensor, size, + interp_method, align_corners, align_mode, + img_h, img_w, img_c, roi.y, roi.x, roi.h, + roi.w, data_format); } } }; diff --git a/paddle/fluid/operators/data/batch_random_crop_and_resize_op.h b/paddle/fluid/operators/data/batch_random_crop_and_resize_op.h index 6ab18f3f5e4a70..76c0a334c13efc 100644 --- a/paddle/fluid/operators/data/batch_random_crop_and_resize_op.h +++ b/paddle/fluid/operators/data/batch_random_crop_and_resize_op.h @@ -41,4 +41,3 @@ class BatchRandomCropAndResizeCPUKernel : public framework::OpKernel { } // namespace data } // namespace operators } // namespace paddle - diff --git a/paddle/fluid/operators/data/batch_resize_op.h b/paddle/fluid/operators/data/batch_resize_op.h index edeafffb0dc695..cd39a8dd66272f 100644 --- a/paddle/fluid/operators/data/batch_resize_op.h +++ b/paddle/fluid/operators/data/batch_resize_op.h @@ -15,8 +15,8 @@ #pragma once #include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -35,4 +35,3 @@ class BatchResizeCPUKernel : public framework::OpKernel { } // namespace data } // namespace operators } // namespace paddle - diff --git 
a/paddle/fluid/operators/data/data_reader_op.cc b/paddle/fluid/operators/data/data_reader_op.cc index 45908a95b475a1..af64d5acaf8d51 100644 --- a/paddle/fluid/operators/data/data_reader_op.cc +++ b/paddle/fluid/operators/data/data_reader_op.cc @@ -19,7 +19,7 @@ namespace operators { namespace data { // initialization static variables out of ReaderManager -ReaderManager *ReaderManager::rm_instance_ptr_ = nullptr; +ReaderManager* ReaderManager::rm_instance_ptr_ = nullptr; std::mutex ReaderManager::m_; class DataReaderOp : public framework::OperatorBase { @@ -28,7 +28,7 @@ class DataReaderOp : public framework::OperatorBase { const framework::VariableNameMap& inputs, const framework::VariableNameMap& outputs, const framework::AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs) {} + : OperatorBase(type, inputs, outputs, attrs) {} void InferShape(framework::InferShapeContext* ctx) const { OP_INOUT_CHECK(ctx->HasOutputs("Out"), "Output", "Out", "DataReaderOp"); @@ -36,7 +36,7 @@ class DataReaderOp : public framework::OperatorBase { private: void RunImpl(const framework::Scope& scope, - const platform::Place& dev_place) const override { + const platform::Place& dev_place) const override { auto outputs = Outputs("Out"); std::vector output_vars; output_vars.reserve(outputs.size()); @@ -61,8 +61,8 @@ class DataReaderOp : public framework::OperatorBase { auto output_queues = GetQueueVecFromVariableVec(output_vars); ReaderManager::Instance()->StartDataReader( reader_id, reader_block, &scope, platform::CPUPlace(), indices_var_name, - output_var_names, output_queues, batch_size, num_samples, - shuffle, drop_last, seed, rank, world_size); + output_var_names, output_queues, batch_size, num_samples, shuffle, + drop_last, seed, rank, world_size); } }; @@ -86,27 +86,24 @@ class DataReaderOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("batch_size", "The batch size for reading samples") .SetDefault(1); AddAttr("num_samples", "The sample number in 
dataset"); - AddAttr("shuffle", "Whether shuffle the dataset") - .SetDefault(false); + AddAttr("shuffle", "Whether shuffle the dataset").SetDefault(false); AddAttr("drop_last", "Whether drop last incomplete batch") .SetDefault(false); - AddAttr("seed", "Random seed for shuffle") - .SetDefault(0); - AddAttr("rank", "The logical rank of current device.") - .SetDefault(0); - AddAttr("world_size", "The number of running devices.") - .SetDefault(1); + AddAttr("seed", "Random seed for shuffle").SetDefault(0); + AddAttr("rank", "The logical rank of current device.").SetDefault(0); + AddAttr("world_size", "The number of running devices.").SetDefault(1); AddAttr("reader_id", "The unique id to generate and get reader"); AddAttr("reader_block", "(BlockDesc *)" "The global block of executed reader program " "desc."); AddAttr("indices_var_name", - "(string)" - "input variable names for sample indices"); - AddAttr>("output_var_names", - "(list of string)" - "output variable names for reader program"); + "(string)" + "input variable names for sample indices"); + AddAttr>( + "output_var_names", + "(list of string)" + "output variable names for reader program"); AddComment(R"DOC( This operator read a file. )DOC"); @@ -119,8 +116,7 @@ class DataReaderOpMaker : public framework::OpProtoAndCheckerMaker { namespace ops = paddle::operators::data; -REGISTER_OPERATOR( - data_reader, ops::DataReaderOp, ops::DataReaderOpMaker, - ops::DataReaderInferShape, ops::DataReaderInferVarType) +REGISTER_OPERATOR(data_reader, ops::DataReaderOp, ops::DataReaderOpMaker, + ops::DataReaderInferShape, ops::DataReaderInferVarType) REGISTER_OP_CPU_KERNEL(data_reader, ops::DataReaderCPUKernel) diff --git a/paddle/fluid/operators/data/data_reader_op.h b/paddle/fluid/operators/data/data_reader_op.h index 7691ee2376fae6..74b03d8a544267 100644 --- a/paddle/fluid/operators/data/data_reader_op.h +++ b/paddle/fluid/operators/data/data_reader_op.h @@ -13,18 +13,25 @@ // limitations under the License. 
#pragma once -#include -#include #include +#include #include +#include #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" +#include "paddle/fluid/platform/enforce.h" + +#ifdef _WIN32 +static unsigned sleep(unsigned seconds) { + Sleep(seconds * 1000); + return 0; +} +#endif namespace paddle { namespace operators { @@ -36,108 +43,94 @@ using BlockDesc = framework::BlockDesc; using LoDTensor = framework::LoDTensor; using LoDTensorArray = framework::LoDTensorArray; using LoDTensorBlockingQueue = operators::reader::LoDTensorBlockingQueue; -using LoDTensorBlockingQueueHolder = operators::reader::LoDTensorBlockingQueueHolder; +using LoDTensorBlockingQueueHolder = + operators::reader::LoDTensorBlockingQueueHolder; class Sampler { - public: - explicit Sampler(const int64_t batch_size, const int64_t num_samples, - const bool shuffle, const bool drop_last, - const int64_t seed, const int rank, - const int world_size) - : current_iter_(0), - batch_size_(batch_size), - shuffle_(shuffle), - drop_last_(drop_last), - rank_(rank), - world_size_(world_size) { - int trunc_num_samples; - if (drop_last) { - int total_batch_size = world_size * batch_size; - trunc_num_samples = floor(num_samples / total_batch_size) * total_batch_size; - sample_ids_.reserve(trunc_num_samples); - VLOG(4) << "Sampler trunc sampler num_samples " << trunc_num_samples; - } - else{ - sample_ids_.reserve(num_samples); - trunc_num_samples = num_samples; - } - for (int64_t i = 0; i < trunc_num_samples; i++) { - sample_ids_.emplace_back(i); - } - num_samples_ = sample_ids_.size(); - if (shuffle) { - rnd_.seed(seed); - std::shuffle(sample_ids_.begin(), sample_ids_.end(), rnd_); - } + public: + explicit Sampler(const 
int64_t batch_size, const int64_t num_samples, + const bool shuffle, const bool drop_last, const int64_t seed, + const int rank, const int world_size) + : current_iter_(0), + batch_size_(batch_size), + shuffle_(shuffle), + drop_last_(drop_last), + rank_(rank), + world_size_(world_size) { + int trunc_num_samples; + if (drop_last) { + int total_batch_size = world_size * batch_size; + trunc_num_samples = + floor(num_samples / total_batch_size) * total_batch_size; + sample_ids_.reserve(trunc_num_samples); + } else { + sample_ids_.reserve(num_samples); + trunc_num_samples = num_samples; + } + for (int64_t i = 0; i < trunc_num_samples; i++) { + sample_ids_.emplace_back(i); } + num_samples_ = sample_ids_.size(); + if (shuffle) { + rnd_.seed(seed); + std::shuffle(sample_ids_.begin(), sample_ids_.end(), rnd_); + } + } - void GetNextIndices(std::vector* indices) { - int64_t start_idx = - batch_size_ * world_size_ * current_iter_ + rank_; - // batch_size_ * world_size_ * current_iter_ + rank_ * batch_size_; - current_iter_++; + void GetNextIndices(std::vector* indices) { + int64_t start_idx = batch_size_ * world_size_ * current_iter_ + rank_; + current_iter_++; - if (start_idx >= num_samples_) { - VLOG(4) << " start idx >= num samples " << start_idx << " >= " << num_samples_; - return; - } + if (start_idx >= num_samples_) return; - for (int64_t i = 0; i < batch_size_; i++) { - int cur_idx = start_idx + i * world_size_; - if (cur_idx >= num_samples_) { - VLOG(4) << " cur_idx >= num samples " << cur_idx << " >= " << num_samples_; - return; - } - indices->emplace_back(sample_ids_[cur_idx]); - } + for (int64_t i = 0; i < batch_size_; i++) { + int cur_idx = start_idx + i * world_size_; + if (cur_idx >= num_samples_) return; + indices->emplace_back(sample_ids_[cur_idx]); } + } - void Reset() { - if (shuffle_) { - std::shuffle(sample_ids_.begin(), sample_ids_.end(), rnd_); - } - - current_iter_ = 0; + void Reset() { + if (shuffle_) { + std::shuffle(sample_ids_.begin(), 
sample_ids_.end(), rnd_); } - private: - int64_t current_iter_; - const int64_t batch_size_; - const bool shuffle_; - int64_t num_samples_; - const bool drop_last_; - const int rank_; - const int world_size_; - - std::mt19937 rnd_; - std::vector sample_ids_; + current_iter_ = 0; + } + + private: + int64_t current_iter_; + const int64_t batch_size_; + const bool shuffle_; + int64_t num_samples_; + const bool drop_last_; + const int rank_; + const int world_size_; + + std::mt19937 rnd_; + std::vector sample_ids_; }; class DataReader { public: - explicit DataReader(BlockDesc* reader_block, - const Scope* scope, - const platform::Place place, - const std::string &indices_var_name, - const std::vector &output_var_names, - const std::vector> output_queues, - const int batch_size, - const int num_samples, - const bool shuffle, - const bool drop_last, - const int64_t seed, - const int rank, - const int world_size) - : running_(true), - shutdown_(false), - reader_block_(reader_block), - place_(place), - indices_var_name_(indices_var_name), - output_var_names_(output_var_names), - output_queues_(output_queues), - batch_size_(batch_size), - sampler_(batch_size, num_samples, shuffle, - drop_last, seed, rank, world_size) { + explicit DataReader( + BlockDesc* reader_block, const Scope* scope, const platform::Place place, + const std::string& indices_var_name, + const std::vector& output_var_names, + const std::vector> output_queues, + const int batch_size, const int num_samples, const bool shuffle, + const bool drop_last, const int64_t seed, const int rank, + const int world_size) + : running_(true), + shutdown_(false), + reader_block_(reader_block), + place_(place), + indices_var_name_(indices_var_name), + output_var_names_(output_var_names), + output_queues_(output_queues), + batch_size_(batch_size), + sampler_(batch_size, num_samples, shuffle, drop_last, seed, rank, + world_size) { StartReaderThread(scope); } @@ -159,7 +152,7 @@ class DataReader { 
sampler_.GetNextIndices(&indices); // shutdown reader if indices drained if (indices.size() == 0) { - for(auto& queue: output_queues_) { + for (auto& queue : output_queues_) { while (queue->Size()) sleep(0.5); queue->Close(); } @@ -172,18 +165,19 @@ class DataReader { try { executor.Run(*reader_block_->Program(), &scope_, - static_cast(reader_block_->ID()), - false, true, {}, false, true); + static_cast(reader_block_->ID()), false, true, {}, + false, true); } catch (...) { break; } for (size_t i = 0; i < output_var_names_.size(); i++) { - auto *out_var = scope_.FindVar(output_var_names_[i]); + auto* out_var = scope_.FindVar(output_var_names_[i]); PADDLE_ENFORCE_NOT_NULL( out_var, platform::errors::NotFound( - "The output variable %s is not found in DataReader " - "program's internal scope", output_var_names_[i])); + "The output variable %s is not found in DataReader " + "program's internal scope", + output_var_names_[i])); // CheckOutputVarStatus(*out_var, output_var_names_[i]); if (out_var->IsType()) { @@ -205,7 +199,7 @@ class DataReader { } void ShutDown() { - for(auto& queue: output_queues_) { + for (auto& queue : output_queues_) { if (queue && !queue->IsClosed()) queue->Close(); } @@ -218,7 +212,7 @@ class DataReader { void Reset() { // reopen all output queues - for (auto& queue: output_queues_) queue->ReOpen(); + for (auto& queue : output_queues_) queue->ReOpen(); // reset sampler to regenerate indices sampler_.Reset(); @@ -228,14 +222,15 @@ class DataReader { running_cond_.notify_all(); } - void ShareIndicesIntoScope(Scope* scope, - std::vector indices) { + void ShareIndicesIntoScope(Scope* scope, std::vector indices) { auto* var = scope->Var(indices_var_name_); auto* indices_tensor = var->GetMutable(); - indices_tensor->Resize(phi::make_ddim({static_cast(indices.size())})); - auto* indices_data = indices_tensor->mutable_data(platform::CPUPlace()); - + indices_tensor->Resize( + phi::make_ddim({static_cast(indices.size())})); + auto* indices_data = + 
indices_tensor->mutable_data(platform::CPUPlace()); + for (size_t i = 0; i < indices.size(); i++) { indices_data[i] = indices[i]; } @@ -268,18 +263,17 @@ class DataReader { } }; - class ReaderManager { private: DISABLE_COPY_AND_ASSIGN(ReaderManager); - static ReaderManager *rm_instance_ptr_; + static ReaderManager* rm_instance_ptr_; static std::mutex m_; std::map> id_to_reader_; public: - static ReaderManager *Instance() { + static ReaderManager* Instance() { if (rm_instance_ptr_ == nullptr) { std::lock_guard lk(m_); if (rm_instance_ptr_ == nullptr) { @@ -290,21 +284,19 @@ class ReaderManager { } void StartDataReader( - const int64_t reader_id, BlockDesc *reader_block, - const Scope* scope, const platform::Place place, - const std::string &indices_var_name, - const std::vector &output_var_names, - const std::vector> &output_queues, + const int64_t reader_id, BlockDesc* reader_block, const Scope* scope, + const platform::Place place, const std::string& indices_var_name, + const std::vector& output_var_names, + const std::vector>& output_queues, const int batch_size, const int num_samples, const bool shuffle, const bool drop_last, const int64_t seed, const int rank, const int world_size) { auto iter = id_to_reader_.find(reader_id); if (iter == id_to_reader_.end()) { - id_to_reader_[reader_id] = std::unique_ptr( - new DataReader(reader_block, scope, place, indices_var_name, - output_var_names, output_queues, batch_size, - num_samples, shuffle, drop_last, seed, - rank, world_size)); + id_to_reader_[reader_id] = std::unique_ptr(new DataReader( + reader_block, scope, place, indices_var_name, output_var_names, + output_queues, batch_size, num_samples, shuffle, drop_last, seed, + rank, world_size)); } } @@ -325,8 +317,8 @@ class ReaderManager { void ShutDown() { auto iter = id_to_reader_.begin(); - while (iter != id_to_reader_.end()){ - if(iter->second.get()){ + while (iter != id_to_reader_.end()) { + if (iter->second.get()) { iter->second.get()->ShutDown(); } iter++; @@ 
-334,36 +326,33 @@ class ReaderManager { id_to_reader_.clear(); } - ReaderManager() { VLOG(1) << "ReaderManager init"; } + ReaderManager() {} - ~ReaderManager() { - VLOG(1) << "~ReaderManager"; - ShutDown(); - } + ~ReaderManager() { ShutDown(); } }; -static void CheckAndInitOutputQueue(const std::vector& vars, int capacity) { +static void CheckAndInitOutputQueue(const std::vector& vars, + int capacity) { for (auto var : vars) { if (var->IsInitialized()) { PADDLE_ENFORCE_EQ(var->IsType(), true, - platform::errors::InvalidArgument( - "Output Variables of DataLoaderOp should hold " - "LoDTensorBlockingQueueHolder type")); + platform::errors::InvalidArgument( + "Output Variables of DataLoaderOp should hold " + "LoDTensorBlockingQueueHolder type")); auto queue = var->Get().GetQueue(); if (queue == nullptr) { auto* holder = var->template GetMutable(); holder->InitOnce(capacity); - VLOG(1) << "DataLoaderOpKernel init queue" << holder->GetQueue(); } } else { - VLOG(1) << "Initialize Output LoDTensorBlockingQueue capacity " << capacity; auto* holder = var->GetMutable(); holder->InitOnce(capacity); } } } -static std::vector> GetQueueVecFromVariableVec(const std::vector& vars) { +static std::vector> +GetQueueVecFromVariableVec(const std::vector& vars) { std::vector> queues; queues.reserve(vars.size()); for (size_t i = 0; i < vars.size(); i++) { @@ -373,9 +362,9 @@ static std::vector> GetQueueVecFromVaria } template -class DataReaderCPUKernel: public framework::OpKernel { +class DataReaderCPUKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override {} + void Compute(const framework::ExecutionContext& ctx) const override {} }; } // namespace data diff --git a/paddle/fluid/operators/data/dataloader_op.h b/paddle/fluid/operators/data/dataloader_op.h index ff64c38a55b134..59273d463f759b 100644 --- a/paddle/fluid/operators/data/dataloader_op.h +++ b/paddle/fluid/operators/data/dataloader_op.h @@ -11,8 +11,8 @@ #pragma once 
#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/operators/data/pipeline.h" +#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace operators { @@ -39,7 +39,6 @@ class DataLoaderOpKernel : public framework::OpKernel { pipeline->ReadNext(output_vars); if (!pipeline->IsRunning()) { - VLOG(4) << "DataLoaderOpKernel Pipeline not running, throw EOF"; PADDLE_THROW_EOF(); } } diff --git a/paddle/fluid/operators/data/file_label_loader_op.cc b/paddle/fluid/operators/data/file_label_loader_op.cc index 530d51ec35d358..45b41ec9434c69 100644 --- a/paddle/fluid/operators/data/file_label_loader_op.cc +++ b/paddle/fluid/operators/data/file_label_loader_op.cc @@ -27,9 +27,6 @@ class FileLabelLoaderOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(ctx->HasInput("Indices"), true, platform::errors::InvalidArgument( "Input(Indices) of ReadFileLoaderOp is null.")); - // PADDLE_ENFORCE_EQ(ctx->HasOutput("Image"), true, - // platform::errors::InvalidArgument( - // "Output(Image) of ReadFileLoaderOp is null.")); PADDLE_ENFORCE_EQ(ctx->HasOutput("Label"), true, platform::errors::InvalidArgument( "Output(Label) of ReadFileLoaderOp is null.")); diff --git a/paddle/fluid/operators/data/file_label_loader_op.h b/paddle/fluid/operators/data/file_label_loader_op.h index 995c410b7966ed..ceba58293ef8a3 100644 --- a/paddle/fluid/operators/data/file_label_loader_op.h +++ b/paddle/fluid/operators/data/file_label_loader_op.h @@ -117,7 +117,6 @@ static std::vector>* GetFilesAndLabelsFromCache( if (iter == root_to_samples_.end()) { std::vector> samples; ParseFilesAndLabels(data_root, &samples); - VLOG(4) << "Init sample number: " << samples.size(); root_to_samples_[data_root] = samples; } diff --git a/paddle/fluid/operators/data/image_decoder.cc b/paddle/fluid/operators/data/image_decoder.cc index 050016c11d7ce5..975c05473774e6 100644 --- a/paddle/fluid/operators/data/image_decoder.cc +++ 
b/paddle/fluid/operators/data/image_decoder.cc @@ -95,7 +95,6 @@ void ImageDecoder::CPUDecodeRandomCrop(const uint8_t* data, size_t length, size_t workspace_size, framework::LoDTensor* out, platform::Place place) { - VLOG(4) << "CPUDecodeRandomCropResize enter"; #ifdef PADDLE_WITH_OPENCV cv::Mat image = cv::imdecode( cv::Mat(1, length, CV_8UC1, const_cast(data)), diff --git a/paddle/fluid/operators/data/image_decoder.h b/paddle/fluid/operators/data/image_decoder.h index dd2513e8358abb..6882b7c9364901 100644 --- a/paddle/fluid/operators/data/image_decoder.h +++ b/paddle/fluid/operators/data/image_decoder.h @@ -185,14 +185,9 @@ class ImageDecoderThreadPoolManager { } } - ImageDecoderThreadPoolManager() { - VLOG(1) << "ImageDecoderThreadPoolManager init"; - } + ImageDecoderThreadPoolManager() {} - ~ImageDecoderThreadPoolManager() { - VLOG(1) << "~DecoderThreadPoolManager"; - ShutDown(); - } + ~ImageDecoderThreadPoolManager() { ShutDown(); } }; } // namespace data diff --git a/paddle/fluid/operators/data/map_op.cc b/paddle/fluid/operators/data/map_op.cc index ce1778ac710bdd..a79e9f0aa216af 100644 --- a/paddle/fluid/operators/data/map_op.cc +++ b/paddle/fluid/operators/data/map_op.cc @@ -20,8 +20,7 @@ using framework::Tensor; class MapOp : public framework::OperatorBase { public: - MapOp(const std::string& type, - const framework::VariableNameMap& inputs, + MapOp(const std::string& type, const framework::VariableNameMap& inputs, const framework::VariableNameMap& outputs, const framework::AttributeMap& attrs) : OperatorBase(type, inputs, outputs, attrs) {} @@ -39,7 +38,7 @@ class MapOp : public framework::OperatorBase { private: void RunImpl(const framework::Scope& scope, - const platform::Place& dev_place) const override { + const platform::Place& dev_place) const override { // Step1: get output vars and attrs auto inputs = Inputs("In"); std::vector input_vars; @@ -66,9 +65,8 @@ class MapOp : public framework::OperatorBase { auto input_queues = 
GetQueueVecFromVariableVec(input_vars); auto output_queues = GetQueueVecFromVariableVec(output_vars); data::MapRunnerManager::Instance()->StartMapRunner( - map_block, program_id, &scope, dev_place, - input_var_names, output_var_names, - input_queues, output_queues); + map_block, program_id, &scope, dev_place, input_var_names, + output_var_names, input_queues, output_queues); } }; @@ -89,7 +87,7 @@ class MapOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddInput("In", "(LoDTensorBlockingQueueHolder)" - "The output tensors of Map operator") + "The output tensors of Map operator") .AsDuplicable(); AddOutput("Out", "(LoDTensorBlockingQueueHolder)" @@ -104,11 +102,11 @@ class MapOpMaker : public framework::OpProtoAndCheckerMaker { "The unique hash id used as cache key for " "ExecutorInfoCache"); AddAttr>("input_var_names", - "(list of string)" - "input variable names for map program"); + "(list of string)" + "input variable names for map program"); AddAttr>("output_var_names", - "(list of string)" - "output variable names for map program"); + "(list of string)" + "output variable names for map program"); AddComment(R"DOC( Map Op )DOC"); @@ -119,6 +117,7 @@ class MapOpMaker : public framework::OpProtoAndCheckerMaker { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(map, ops::MapOp, ops::MapOpMaker, - ops::MapInferShape, ops::MapInferVarType); -REGISTER_OP_CPU_KERNEL(map, ops::MapOpKernel); +REGISTER_OPERATOR(map, ops::MapOp, ops::MapOpMaker, ops::MapInferShape, + ops::MapInferVarType); +REGISTER_OP_CPU_KERNEL( + map, ops::MapOpKernel); diff --git a/paddle/fluid/operators/data/map_op.cu.cc b/paddle/fluid/operators/data/map_op.cu.cc index 7f931b2a1281b2..ac9009bfced1e0 100644 --- a/paddle/fluid/operators/data/map_op.cu.cc +++ b/paddle/fluid/operators/data/map_op.cu.cc @@ -16,5 +16,4 @@ namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( - map, - ops::MapOpKernel); + map, 
ops::MapOpKernel); diff --git a/paddle/fluid/operators/data/map_op.h b/paddle/fluid/operators/data/map_op.h index 9ca34671f882cc..7a2266d06c57aa 100644 --- a/paddle/fluid/operators/data/map_op.h +++ b/paddle/fluid/operators/data/map_op.h @@ -11,52 +11,52 @@ #pragma once #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" #include "paddle/fluid/operators/data/map_runner.h" +#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" namespace paddle { namespace operators { using Variable = framework::Variable; using LoDTensor = framework::LoDTensor; -using LoDTensorBlockingQueueHolder = operators::reader::LoDTensorBlockingQueueHolder; - +using LoDTensorBlockingQueueHolder = + operators::reader::LoDTensorBlockingQueueHolder; static void CheckInputQueueStatus(const std::vector& vars) { for (auto var : vars) { PADDLE_ENFORCE_EQ(var->IsType(), true, - platform::errors::InvalidArgument( - "Input Variables of MapOp should hold " - "LoDTensorBlockingQueueHolder type")); + platform::errors::InvalidArgument( + "Input Variables of MapOp should hold " + "LoDTensorBlockingQueueHolder type")); auto queue = var->Get().GetQueue(); PADDLE_ENFORCE_NE(queue, nullptr, - platform::errors::InvalidArgument( - "Input LoDTensorBlockingQueue is not initialized")); + platform::errors::InvalidArgument( + "Input LoDTensorBlockingQueue is not initialized")); } } -static void CheckAndInitOutputQueue(const std::vector& vars, int capacity) { +static void CheckAndInitOutputQueue(const std::vector& vars, + int capacity) { for (auto var : vars) { if (var->IsInitialized()) { PADDLE_ENFORCE_EQ(var->IsType(), true, - platform::errors::InvalidArgument( - "Output Variables of MapOp should hold " - "LoDTensorBlockingQueueHolder type")); + platform::errors::InvalidArgument( + "Output Variables of MapOp should hold " + "LoDTensorBlockingQueueHolder type")); auto queue = var->Get().GetQueue(); if (queue == nullptr) { auto* holder = 
var->template GetMutable(); holder->InitOnce(capacity); - VLOG(1) << "MapOpKernel init queue" << holder->GetQueue(); } } else { - VLOG(1) << "Initialize Output LoDTensorBlockingQueue capacity " << capacity; auto* holder = var->GetMutable(); holder->InitOnce(capacity); } } } -static std::vector> GetQueueVecFromVariableVec(const std::vector& vars) { +static std::vector> +GetQueueVecFromVariableVec(const std::vector& vars) { std::vector> queues; queues.reserve(vars.size()); for (size_t i = 0; i < vars.size(); i++) { diff --git a/paddle/fluid/operators/data/map_runner.cc b/paddle/fluid/operators/data/map_runner.cc index 4e043b60e26f96..a1e7cf190d4efc 100644 --- a/paddle/fluid/operators/data/map_runner.cc +++ b/paddle/fluid/operators/data/map_runner.cc @@ -11,20 +11,25 @@ #include -#include "paddle/fluid/operators/data/map_runner.h" #include "paddle/fluid/framework/executor_cache.h" +#include "paddle/fluid/operators/data/map_runner.h" + +#ifdef _WIN32 +static unsigned sleep(unsigned seconds) { + Sleep(seconds * 1000); + return 0; +} +#endif namespace paddle { namespace operators { namespace data { MapRunner::MapRunner( - const std::shared_ptr map_block, - const int64_t program_id, - const Scope* scope, - const platform::Place &place, - const std::vector &input_var_names, - const std::vector &output_var_names, + const std::shared_ptr map_block, const int64_t program_id, + const Scope* scope, const platform::Place& place, + const std::vector& input_var_names, + const std::vector& output_var_names, const std::vector> input_queues, const std::vector> output_queues) : running_(true), @@ -36,20 +41,19 @@ MapRunner::MapRunner( output_var_names_(output_var_names), input_queues_(input_queues), output_queues_(output_queues) { - VLOG(1) << "MapRunner init"; - - PADDLE_ENFORCE_EQ(input_var_names_.size(), input_queues_.size(), - platform::errors::InvalidArgument( - "input_var_names length should be equal to input_queues length, " - "but recieve %d != %d.", - input_var_names_.size(), 
- input_queues_.size())); - PADDLE_ENFORCE_EQ(output_var_names_.size(), output_queues_.size(), - platform::errors::InvalidArgument( - "output_var_names length should be equal to output_queues length, " - "but recieve %d != %d.", - output_var_names_.size(), - output_queues_.size())); + + PADDLE_ENFORCE_EQ( + input_var_names_.size(), input_queues_.size(), + platform::errors::InvalidArgument( + "input_var_names length should be equal to input_queues length, " + "but recieve %d != %d.", + input_var_names_.size(), input_queues_.size())); + PADDLE_ENFORCE_EQ( + output_var_names_.size(), output_queues_.size(), + platform::errors::InvalidArgument( + "output_var_names length should be equal to output_queues length, " + "but recieve %d != %d.", + output_var_names_.size(), output_queues_.size())); StartMapThread(scope); } @@ -69,7 +73,7 @@ bool MapRunner::ShareInputsIntoScope(Scope* scope) { // input array length = 1, treat input type as LoDTensor // FIXME(dkp): this may incur error if batch size = 1 auto tensor = tensor_arr[0]; - if (!tensor.IsInitialized()) return false; + if (!tensor.IsInitialized()) return false; // get dst variable from scope and check status auto name = input_var_names_[i]; @@ -81,8 +85,8 @@ bool MapRunner::ShareInputsIntoScope(Scope* scope) { dst_tensor->set_lod(tensor.lod()); } else { // input array length > 1 treat input type as LoDTensorArray - for (auto tensor: tensor_arr) { - if (!tensor.IsInitialized()) return false; + for (auto tensor : tensor_arr) { + if (!tensor.IsInitialized()) return false; } // get dst variable from scope and check status @@ -91,7 +95,7 @@ bool MapRunner::ShareInputsIntoScope(Scope* scope) { // share input tensor to dst variable auto& dst_tensor_arr = *(var->GetMutable()); - for (auto &tensor: dst_tensor_arr) tensor.clear(); + for (auto& tensor : dst_tensor_arr) tensor.clear(); dst_tensor_arr.clear(); dst_tensor_arr.reserve(tensor_arr.size()); for (size_t i = 0; i < tensor_arr.size(); i++) { @@ -102,10 +106,7 @@ bool 
MapRunner::ShareInputsIntoScope(Scope* scope) { return true; } -void signal_handler(int sig_num) { - VLOG(1) << "MapThread crash with signal " << sig_num; - _exit(-1); -} +void signal_handler(int sig_num) { _exit(-1); } void MapRunner::StartMapThread(const Scope* scope) { map_thread_ = std::thread([this, scope]() -> void { @@ -125,8 +126,8 @@ void MapRunner::StartMapThread(const Scope* scope) { // Step 1: get input LoDTensor and share into Scope bool success = ShareInputsIntoScope(&scope_); if (!success) { - for(auto& queue : output_queues_) { - while(queue->Size()) sleep(0.5); + for (auto& queue : output_queues_) { + while (queue->Size()) sleep(0.5); queue->Close(); } running_ = false; @@ -135,15 +136,17 @@ void MapRunner::StartMapThread(const Scope* scope) { // Step 2: run ops by executor without fetch try { - executor.Run(*map_block_->Program(), &scope_, static_cast(map_block_->ID()), false, true, std::vector(), false, true); - } catch(...) { + executor.Run(*map_block_->Program(), &scope_, + static_cast(map_block_->ID()), false, true, + std::vector(), false, true); + } catch (...) 
{ break; } // Step 3: fetch output variable to LoDTensor vector // and push to output queue for (size_t i = 0; i < output_var_names_.size(); i++) { - auto *out_var = scope_.FindVar(output_var_names_[i]); + auto* out_var = scope_.FindVar(output_var_names_[i]); PADDLE_ENFORCE_NOT_NULL( out_var, platform::errors::NotFound( "The output variable %s is not found in Map " @@ -169,8 +172,8 @@ void MapRunner::StartMapThread(const Scope* scope) { }); } -void MapRunner::CheckOutputVarStatus(const Variable &var, - const std::string &var_name) { +void MapRunner::CheckOutputVarStatus(const Variable& var, + const std::string& var_name) { // only LoDTensor & LoDTensorArray variable type support currently if (var.IsType()) { PADDLE_ENFORCE_EQ(var.Get().IsInitialized(), true, @@ -180,24 +183,24 @@ void MapRunner::CheckOutputVarStatus(const Variable &var, var_name)); } else if (var.IsType()) { auto tensor_array = var.Get(); - for (auto tensor: tensor_array) { + for (auto tensor : tensor_array) { PADDLE_ENFORCE_EQ(tensor.IsInitialized(), true, platform::errors::InvalidArgument( - "The tensor in LoDTensorArray of output " - "variable %s get from Map program's internal " - "scope is not initialized.", var_name)); + "The tensor in LoDTensorArray of output " + "variable %s get from Map program's internal " + "scope is not initialized.", + var_name)); } } else { PADDLE_THROW(platform::errors::InvalidArgument( - "MapOp can only support LoDTensor or LoDTensorArray")); + "MapOp can only support LoDTensor or LoDTensorArray")); } } void MapRunner::ShutDown() { - VLOG(1) << "MapRunner shutdown " << program_id_; // close all output queue, op after this op can shutdown itself - for (auto queue : output_queues_) { - if(queue && !queue->IsClosed()) queue->Close(); + for (auto queue : output_queues_) { + if (queue && !queue->IsClosed()) queue->Close(); } shutdown_ = true; @@ -208,15 +211,14 @@ void MapRunner::ShutDown() { } void MapRunner::Reset() { - VLOG(1) << "MapRunner reset " << program_id_; - 
for (auto queue : output_queues_) queue->ReOpen(); + for (auto queue : output_queues_) queue->ReOpen(); running_ = true; running_cond_.notify_all(); } // initialization static variables out of MapRunnerManager -MapRunnerManager *MapRunnerManager::pm_instance_ptr_ = nullptr; +MapRunnerManager* MapRunnerManager::pm_instance_ptr_ = nullptr; std::mutex MapRunnerManager::m_; } // data diff --git a/paddle/fluid/operators/data/map_runner.h b/paddle/fluid/operators/data/map_runner.h index b110e67d769b0d..c1e23436480512 100644 --- a/paddle/fluid/operators/data/map_runner.h +++ b/paddle/fluid/operators/data/map_runner.h @@ -29,25 +29,22 @@ using Variable = framework::Variable; using LoDTensor = framework::LoDTensor; using LoDTensorArray = framework::LoDTensorArray; using LoDTensorBlockingQueue = operators::reader::LoDTensorBlockingQueue; -using LoDTensorBlockingQueueHolder = operators::reader::LoDTensorBlockingQueueHolder; +using LoDTensorBlockingQueueHolder = + operators::reader::LoDTensorBlockingQueueHolder; namespace data { class MapRunner { public: - MapRunner(const std::shared_ptr map_block, - const int64_t program_id, - const Scope* scope, - const platform::Place &place, - const std::vector &input_var_names, - const std::vector &output_var_names, - const std::vector> input_queues, - const std::vector> output_queues); - - ~MapRunner() { - VLOG(1) << "~MapRunner"; - ShutDown(); - } + MapRunner( + const std::shared_ptr map_block, const int64_t program_id, + const Scope *scope, const platform::Place &place, + const std::vector &input_var_names, + const std::vector &output_var_names, + const std::vector> input_queues, + const std::vector> output_queues); + + ~MapRunner() { ShutDown(); } void ShutDown(); @@ -55,7 +52,6 @@ class MapRunner { inline bool IsRunning() { return running_; } - private: void copy_tensor(const framework::LoDTensor &lod_tensor, framework::LoDTensor *out) const { @@ -65,9 +61,9 @@ class MapRunner { out_tensor.set_lod(lod_tensor.lod()); } - bool 
ShareInputsIntoScope(Scope* scope); + bool ShareInputsIntoScope(Scope *scope); - void StartMapThread(const Scope* scope); + void StartMapThread(const Scope *scope); void CheckInputVarStatus(const Variable &var, const std::string &var_name); void CheckOutputVarStatus(const Variable &var, const std::string &var_name); @@ -111,25 +107,26 @@ class MapRunnerManager { } void StartMapRunner( - BlockDesc *map_block, const int64_t program_id, - const Scope* scope, const platform::Place &place, + BlockDesc *map_block, const int64_t program_id, const Scope *scope, + const platform::Place &place, const std::vector &input_var_names, const std::vector &output_var_names, const std::vector> &input_queues, - const std::vector> &output_queues) { + const std::vector> + &output_queues) { auto iter = prog_id_to_runner_.find(program_id); if (iter == prog_id_to_runner_.end()) { prog_id_to_runner_[program_id] = std::unique_ptr(new MapRunner( std::shared_ptr(map_block), program_id, scope, place, input_var_names, output_var_names, input_queues, output_queues)); - } + } } void ShutDownMapRunner(int program_id) { std::lock_guard lk(m_); auto iter = prog_id_to_runner_.find(program_id); if (iter != prog_id_to_runner_.end()) { - if(iter->second.get()) iter->second.get()->ShutDown(); + if (iter->second.get()) iter->second.get()->ShutDown(); prog_id_to_runner_.erase(iter); } } @@ -144,7 +141,7 @@ class MapRunnerManager { void ShutDown() { if (prog_id_to_runner_.empty()) return; - + std::lock_guard lk(m_); auto iter = prog_id_to_runner_.begin(); for (; iter != prog_id_to_runner_.end(); iter++) { @@ -152,12 +149,9 @@ class MapRunnerManager { } } - MapRunnerManager() { VLOG(1) << "MapRunnerManager init"; } + MapRunnerManager() {} - ~MapRunnerManager() { - VLOG(1) << "~MapRunnerManager"; - ShutDown(); - } + ~MapRunnerManager() { ShutDown(); } }; } // data diff --git a/paddle/fluid/operators/data/mirror_normalize_op.cc b/paddle/fluid/operators/data/mirror_normalize_op.cc index 
638a1cc68bb461..183d9015008f3f 100644 --- a/paddle/fluid/operators/data/mirror_normalize_op.cc +++ b/paddle/fluid/operators/data/mirror_normalize_op.cc @@ -23,31 +23,36 @@ class MirrorNormalizeOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("X"), true, - platform::errors::NotFound("Input(X) of MirrorNormalizeOp should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, + platform::errors::NotFound( + "Input(X) of MirrorNormalizeOp should not be null.")); PADDLE_ENFORCE_EQ( ctx->HasInput("Mirror"), true, - platform::errors::NotFound("Input(Mirror) of MirrorNormalizeOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::NotFound( - "Output(Out) of MirrorNormalizeOp should not be null.")); + platform::errors::NotFound( + "Input(Mirror) of MirrorNormalizeOp should not be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasOutput("Out"), true, + platform::errors::NotFound( + "Output(Out) of MirrorNormalizeOp should not be null.")); auto x_dims = ctx->GetInputDim("X"); if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(x_dims.size(), 4, - platform::errors::NotFound( - "Input(X) of MirrorNormalizeOp should be a 4-D Tensor")); + PADDLE_ENFORCE_EQ( + x_dims.size(), 4, + platform::errors::NotFound( + "Input(X) of MirrorNormalizeOp should be a 4-D Tensor")); auto c = x_dims[1]; auto mean = ctx->Attrs().Get>("mean"); auto std = ctx->Attrs().Get>("std"); - PADDLE_ENFORCE_EQ(mean.size(), c, - platform::errors::NotFound( - "The channel number of Input(X) should equal to length of mean")); - PADDLE_ENFORCE_EQ(mean.size(), c, - platform::errors::NotFound( - "The channel number of Input(X) should equal to length of mean")); + PADDLE_ENFORCE_EQ( + mean.size(), c, + platform::errors::NotFound( + "The channel number of Input(X) should equal to length of mean")); + PADDLE_ENFORCE_EQ( + mean.size(), 
c, + platform::errors::NotFound( + "The channel number of Input(X) should equal to length of mean")); } std::vector output_dims(x_dims.size()); @@ -69,10 +74,12 @@ class MirrorNormalizeOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", "(Tensor), The input tensor of mirror_normalize op."); - AddInput("Mirror", "(Tensor), The mirror vector for random flip, the " - "shape is {N, 1}, N is the batch size of input X"); - AddOutput("Out", "(Tensor), The output tensor in the same shape as " - "input X."); + AddInput("Mirror", + "(Tensor), The mirror vector for random flip, the " + "shape is {N, 1}, N is the batch size of input X"); + AddOutput("Out", + "(Tensor), The output tensor in the same shape as " + "input X."); AddAttr>("mean", "The mean value to normalize data"); AddAttr>("std", "The stdvalue to normalize data"); AddComment(R"DOC( @@ -81,7 +88,8 @@ class MirrorNormalizeOpMaker : public framework::OpProtoAndCheckerMaker { } }; -class MirrorNormalizeOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput { +class MirrorNormalizeOpInferVarType + : public framework::PassInDtypeAndVarTypeToOutput { protected: std::unordered_map& GetInputOutputWithSameType() const override { @@ -96,8 +104,9 @@ class MirrorNormalizeOpInferVarType : public framework::PassInDtypeAndVarTypeToO namespace ops = paddle::operators::data; namespace plat = paddle::platform; -REGISTER_OPERATOR(mirror_normalize, ops::MirrorNormalizeOp, ops::MirrorNormalizeOpMaker, ops::MirrorNormalizeOpInferVarType); +REGISTER_OPERATOR(mirror_normalize, ops::MirrorNormalizeOp, + ops::MirrorNormalizeOpMaker, + ops::MirrorNormalizeOpInferVarType); -REGISTER_OP_CPU_KERNEL( - mirror_normalize, ops::MirrorNormalizeCPUKernel, - ops::MirrorNormalizeCPUKernel); +REGISTER_OP_CPU_KERNEL(mirror_normalize, ops::MirrorNormalizeCPUKernel, + ops::MirrorNormalizeCPUKernel); diff --git a/paddle/fluid/operators/data/mirror_normalize_op.cu 
b/paddle/fluid/operators/data/mirror_normalize_op.cu index d940da3982468a..8bca0640d75991 100644 --- a/paddle/fluid/operators/data/mirror_normalize_op.cu +++ b/paddle/fluid/operators/data/mirror_normalize_op.cu @@ -23,16 +23,16 @@ namespace data { using framework::LoDTensor; -template -__global__ void KeMirrorNormalize( - const int numel, const T* in_data, const bool* mirrors, T* out_data, - const float* mean, const float* std, const int chw, const int hw, - const int w) { +template +__global__ void KeMirrorNormalize(const int numel, const T* in_data, + const bool* mirrors, T* out_data, + const float* mean, const float* std, + const int chw, const int hw, const int w) { CUDA_KERNEL_LOOP(idx, numel) { int ni = idx / chw; int ci = (idx % chw) / hw; int wi = idx % w; - + int out_idx = idx; if (mirrors[ni]) out_idx = idx - 2 * wi + w - 1; out_data[out_idx] = (in_data[idx] - mean[ci]) / std[ci]; @@ -44,7 +44,7 @@ class MirrorNormalizeCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* x = ctx.Input("X"); - auto* mirror = ctx.Input("Mirror"); + auto* mirror = ctx.Input("Mirror"); auto* out = ctx.Output("Out"); auto mean = ctx.Attr>("mean"); @@ -76,9 +76,9 @@ class MirrorNormalizeCUDAKernel : public framework::OpKernel { dev_ctx.stream()); platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(dev_ctx, numel); - KeMirrorNormalize<<>>( + platform::GetGpuLaunchConfig1D(dev_ctx, numel); + KeMirrorNormalize<<>>( numel, x_data, mirror_data, out_data, mean_data, std_data, chw, hw, w); } }; diff --git a/paddle/fluid/operators/data/pipeline.cc b/paddle/fluid/operators/data/pipeline.cc index c28517b3bb4e7c..216b41ea6da274 100644 --- a/paddle/fluid/operators/data/pipeline.cc +++ b/paddle/fluid/operators/data/pipeline.cc @@ -27,8 +27,6 @@ Pipeline::Pipeline(const std::shared_ptr global_block, end_op_index_(end_op_index), program_id_(program_id), output_var_names_(output_var_names) { - VLOG(1) << 
"Pipeline init"; - PADDLE_ENFORCE_GT(end_op_index_, start_op_index_, platform::errors::InvalidArgument( "end_op_index should be greater than start_op_index, " @@ -78,11 +76,12 @@ void Pipeline::CheckOutputVarStatus(const Variable &var, } void Pipeline::ReadNext(std::vector &out_vars) { - PADDLE_ENFORCE_EQ(out_vars.size(), output_var_names_.size(), - platform::errors::InvalidArgument( + PADDLE_ENFORCE_EQ( + out_vars.size(), output_var_names_.size(), + platform::errors::InvalidArgument( "Out variable number should equal to output variable name " - "number, but receive %d != %d", out_vars.size(), - output_var_names_.size())); + "number, but receive %d != %d", + out_vars.size(), output_var_names_.size())); for (size_t i = 0; i < output_var_names_.size(); i++) { auto *out_var = scope_.FindVar(output_var_names_[i]); PADDLE_ENFORCE_NOT_NULL( @@ -98,12 +97,13 @@ void Pipeline::ReadNext(std::vector &out_vars) { bool success = true; auto outputs = out_queue->Pop(&success); - PADDLE_ENFORCE_EQ(success, true, - platform::errors::PreconditionNotMet("Read from output queue %s failed", output_var_names_[i])); - + PADDLE_ENFORCE_EQ(success, true, platform::errors::PreconditionNotMet( + "Read from output queue %s failed", + output_var_names_[i])); + CheckOutputVarStatus(*(out_vars[i]), output_var_names_[i]); copy_tensor(outputs.at(0), out_vars[i]->GetMutable()); - for (auto &output: outputs) output.clear(); + for (auto &output : outputs) output.clear(); outputs.clear(); } } diff --git a/paddle/fluid/operators/data/pipeline.h b/paddle/fluid/operators/data/pipeline.h index 8215473bfcb3bf..0c673b52eb785a 100644 --- a/paddle/fluid/operators/data/pipeline.h +++ b/paddle/fluid/operators/data/pipeline.h @@ -29,7 +29,8 @@ using ParallelExecutor = framework::ParallelExecutor; using Variable = framework::Variable; using LoDTensor = framework::LoDTensor; using LoDTensorBlockingQueue = operators::reader::LoDTensorBlockingQueue; -using LoDTensorBlockingQueueHolder = 
operators::reader::LoDTensorBlockingQueueHolder; +using LoDTensorBlockingQueueHolder = + operators::reader::LoDTensorBlockingQueueHolder; namespace data { @@ -39,9 +40,8 @@ class Pipeline { const platform::Place &place, int64_t start_op_index, int64_t end_op_index, int64_t program_id, const std::vector &output_var_names); - // size_t prefetch_queue_size); - ~Pipeline() { VLOG(1) << "~Pipeline"; } + ~Pipeline() {} void ReadNext(std::vector &out_vars); @@ -50,19 +50,18 @@ class Pipeline { void Reset() { running_.store(true); } private: - void CheckOutputVarStatus(const Variable &var, const std::string &var_name); - void copy_tensor(const framework::LoDTensor& lod_tensor, - framework::LoDTensor* out) const { + void copy_tensor(const framework::LoDTensor &lod_tensor, + framework::LoDTensor *out) const { if (lod_tensor.numel() == 0) return; - auto& out_tensor = *out; + auto &out_tensor = *out; framework::TensorCopy(lod_tensor, lod_tensor.place(), &out_tensor); out_tensor.set_lod(lod_tensor.lod()); } std::atomic running_; - + Scope scope_; std::shared_ptr global_block_; platform::Place place_; @@ -71,7 +70,6 @@ class Pipeline { int64_t program_id_; std::vector output_var_names_; - }; class PipelineManager { @@ -96,10 +94,10 @@ class PipelineManager { return pm_instance_ptr_; } - Pipeline* GetPipeline( - int64_t program_id, BlockDesc *global_block, const platform::Place &place, - int64_t start_op_index, int64_t end_op_index, - const std::vector &output_var_names) { + Pipeline *GetPipeline(int64_t program_id, BlockDesc *global_block, + const platform::Place &place, int64_t start_op_index, + int64_t end_op_index, + const std::vector &output_var_names) { auto iter = prog_id_to_pipeline_.find(program_id); if (iter == prog_id_to_pipeline_.end()) { prog_id_to_pipeline_[program_id] = std::unique_ptr(new Pipeline( @@ -122,16 +120,11 @@ class PipelineManager { } } - void ShutDown() { - prog_id_to_pipeline_.clear(); - } + void ShutDown() { prog_id_to_pipeline_.clear(); } - 
PipelineManager() { VLOG(1) << "PipelineManager init"; } + PipelineManager() {} - ~PipelineManager() { - VLOG(1) << "~PipelineManager"; - ShutDown(); - } + ~PipelineManager() { ShutDown(); } }; } // data diff --git a/paddle/fluid/operators/data/random_roi_generator.cc b/paddle/fluid/operators/data/random_roi_generator.cc index 8bdb225f724254..af512218a39980 100644 --- a/paddle/fluid/operators/data/random_roi_generator.cc +++ b/paddle/fluid/operators/data/random_roi_generator.cc @@ -18,20 +18,20 @@ namespace paddle { namespace operators { namespace data { -RandomROIGenerator::RandomROIGenerator( - AspectRatioRange aspect_ratio_range, AreaRange area_range, - int64_t seed, int64_t num_attempts) +RandomROIGenerator::RandomROIGenerator(AspectRatioRange aspect_ratio_range, + AreaRange area_range, int64_t seed, + int64_t num_attempts) : aspect_ratio_range_(aspect_ratio_range), area_range_(area_range), random_generator_(seed), seed_(seed), num_attempts_(num_attempts) {} -void RandomROIGenerator::GenerateRandomROI( - const int64_t width, const int64_t height, ROI* roi) { +void RandomROIGenerator::GenerateRandomROI(const int64_t width, + const int64_t height, ROI* roi) { if (width <= 0 || height <= 0) return; - - float min_wh_ratio = aspect_ratio_range_.first; + + float min_wh_ratio = aspect_ratio_range_.first; float max_wh_ratio = aspect_ratio_range_.second; float max_hw_ratio = 1 / aspect_ratio_range_.first; float min_area = width * height * area_distribution_.a(); @@ -53,19 +53,16 @@ void RandomROIGenerator::GenerateRandomROI( float roi_area = scale * height * width; // calc ROI width/height - float ratio = std::exp( - aspect_ratio_distribution_(random_generator_)); - auto w = static_cast( - std::roundf(sqrtf(roi_area * ratio))); - auto h = static_cast( - std::roundf(sqrtf(roi_area / ratio))); + float ratio = std::exp(aspect_ratio_distribution_(random_generator_)); + auto w = static_cast(std::roundf(sqrtf(roi_area * ratio))); + auto h = 
static_cast(std::roundf(sqrtf(roi_area / ratio))); w = std::max(1, w); h = std::max(1, h); // check restrictions ratio = static_cast(w) / h; - if (w <= width && h <= height - && ratio >= min_wh_ratio && ratio <= max_hw_ratio) { + if (w <= width && h <= height && ratio >= min_wh_ratio && + ratio <= max_hw_ratio) { roi->w = w; roi->h = h; break; @@ -93,9 +90,9 @@ void RandomROIGenerator::GenerateRandomROI( // generate random left top coordination x, y roi->x = std::uniform_int_distribution( - 0, width - roi->w)(random_generator_); + 0, width - roi->w)(random_generator_); roi->y = std::uniform_int_distribution( - 0, height - roi->h)(random_generator_); + 0, height - roi->h)(random_generator_); } } diff --git a/paddle/fluid/operators/data/random_roi_generator.h b/paddle/fluid/operators/data/random_roi_generator.h index 57bfb93ed09e9c..88b65045826381 100644 --- a/paddle/fluid/operators/data/random_roi_generator.h +++ b/paddle/fluid/operators/data/random_roi_generator.h @@ -14,12 +14,12 @@ limitations under the License. 
*/ #pragma once -#include +#include +#include +#include #include #include -#include -#include -#include +#include namespace paddle { namespace operators { @@ -38,67 +38,64 @@ struct ROI { }; class RandomROIGenerator { - public: - explicit RandomROIGenerator( - AspectRatioRange aspect_ratio_range, AreaRange area_range, - int64_t seed = time(0), int64_t num_attempts = 10); - - void GenerateRandomROI(const int64_t width, const int64_t height, ROI* roi); + public: + explicit RandomROIGenerator(AspectRatioRange aspect_ratio_range, + AreaRange area_range, int64_t seed = time(0), + int64_t num_attempts = 10); - private: + void GenerateRandomROI(const int64_t width, const int64_t height, ROI* roi); - AspectRatioRange aspect_ratio_range_; - AreaRange area_range_; + private: + AspectRatioRange aspect_ratio_range_; + AreaRange area_range_; - std::uniform_real_distribution aspect_ratio_distribution_; - std::uniform_real_distribution area_distribution_; - std::mt19937 random_generator_; + std::uniform_real_distribution aspect_ratio_distribution_; + std::uniform_real_distribution area_distribution_; + std::mt19937 random_generator_; - int64_t seed_; - int64_t num_attempts_; + int64_t seed_; + int64_t num_attempts_; }; class GeneratorManager { using Generators = std::vector>; - private: + private: + static GeneratorManager* gm_instance_ptr_; + static std::mutex m_; - static GeneratorManager* gm_instance_ptr_; - static std::mutex m_; + std::map> prog_id_to_generators_; - std::map> prog_id_to_generators_; - - public: - static GeneratorManager *Instance() { + public: + static GeneratorManager* Instance() { + if (gm_instance_ptr_ == nullptr) { + std::lock_guard lk(m_); if (gm_instance_ptr_ == nullptr) { - std::lock_guard lk(m_); - if (gm_instance_ptr_ == nullptr) { - gm_instance_ptr_ = new GeneratorManager; - } + gm_instance_ptr_ = new GeneratorManager; } - return gm_instance_ptr_; } - - Generators* GetGenerators(const int64_t program_id, const int batch_size, - const 
AspectRatioRange aspect_ratio_range, - const AreaRange area_range) { - auto iter = prog_id_to_generators_.find(program_id); - if (iter == prog_id_to_generators_.end()) { - prog_id_to_generators_[program_id] = std::unique_ptr( - new Generators(batch_size)); - - std::seed_seq rand_seq{static_cast(time(0))}; - std::vector rands(batch_size); - rand_seq.generate(rands.begin(), rands.end()); - - for (int i = 0; i < batch_size; i++) { - prog_id_to_generators_[program_id]->at(i).reset( - new RandomROIGenerator(aspect_ratio_range, - area_range, rands[i])); - } + return gm_instance_ptr_; + } + + Generators* GetGenerators(const int64_t program_id, const int batch_size, + const AspectRatioRange aspect_ratio_range, + const AreaRange area_range) { + auto iter = prog_id_to_generators_.find(program_id); + if (iter == prog_id_to_generators_.end()) { + prog_id_to_generators_[program_id] = + std::unique_ptr(new Generators(batch_size)); + + std::seed_seq rand_seq{static_cast(time(0))}; + std::vector rands(batch_size); + rand_seq.generate(rands.begin(), rands.end()); + + for (int i = 0; i < batch_size; i++) { + prog_id_to_generators_[program_id]->at(i).reset( + new RandomROIGenerator(aspect_ratio_range, area_range, rands[i])); } - return prog_id_to_generators_[program_id].get(); - } + } + return prog_id_to_generators_[program_id].get(); + } }; } // namespace data diff --git a/paddle/fluid/operators/data/utils.h b/paddle/fluid/operators/data/utils.h index ed7db01d68e265..5d609db8bbf9e0 100644 --- a/paddle/fluid/operators/data/utils.h +++ b/paddle/fluid/operators/data/utils.h @@ -14,39 +14,45 @@ #pragma once #include "paddle/fluid/operators/data/data_reader_op.h" -#include "paddle/fluid/operators/data/image_decoder.h" #include "paddle/fluid/operators/data/map_runner.h" #include "paddle/fluid/operators/data/pipeline.h" +#ifdef PADDLE_WITH_GPU +#include "paddle/fluid/operators/data/image_decoder.h" +#endif namespace paddle { namespace operators { namespace data { +#ifdef PADDLE_WITH_GPU 
extern ImageDecoderThreadPool* decode_pool; +#endif void ShutDownAllDataLoaders() { - VLOG(4) << "ShutDownAllDataLoaders enter"; // step 1: shutdown reader ReaderManager::Instance()->ShutDown(); +#ifdef PADDLE_WITH_GPU // step 2: shutdown decoder if (decode_pool) decode_pool->ShutDown(); +#endif // step 3: shutdown MapRunner MapRunnerManager::Instance()->ShutDown(); - // // step 3: shutdown Pipeline - // PipelineManager::Instance()->ShutDown(); - VLOG(4) << "ShutDownAllDataLoaders Pipeline shutdown finish"; + // step 3: shutdown Pipeline + PipelineManager::Instance()->ShutDown(); } void ShutDownReadersAndDecoders(const int64_t program_id) { // step 1: shutdown reader ReaderManager::Instance()->ShutDownReader(program_id); +#ifdef PADDLE_WITH_GPU // step 2: shutdown decoder ImageDecoderThreadPoolManager::Instance()->ShutDownDecoder(program_id); +#endif } void ShutDownPipeline(const int64_t program_id) { @@ -56,9 +62,6 @@ void ShutDownPipeline(const int64_t program_id) { void ResetDataLoader(const int64_t reader_id, const std::vector map_ids, const int64_t pipeline_id) { - VLOG(4) << "ResetDataLoader enter, reader_id: " << reader_id \ - << ", map_ids size: " << map_ids.size() << ", pipeline_id: " \ - << pipeline_id; // step 1: reset readers ReaderManager::Instance()->ResetReader(reader_id); @@ -69,7 +72,6 @@ void ResetDataLoader(const int64_t reader_id, // step3: reset pipeline PipelineManager::Instance()->ResetPipeline(pipeline_id); - VLOG(4) << "ResetDataLoader finish"; } } // namespace data diff --git a/paddle/fluid/operators/split_lod_tensor_op.cc b/paddle/fluid/operators/split_lod_tensor_op.cc index 8437e857374cd5..dc77bfbface1fe 100644 --- a/paddle/fluid/operators/split_lod_tensor_op.cc +++ b/paddle/fluid/operators/split_lod_tensor_op.cc @@ -170,28 +170,26 @@ class SplitLoDTensorInferShape : public framework::InferShapeBase { OP_INOUT_CHECK(context->HasOutput("OutFalse"), "Output", "OutFalse", "SplitLoDTensor"); - if (context->IsRuntime()) { - auto mask_dim = 
context->GetInputDim("Mask"); - PADDLE_ENFORCE_EQ( - mask_dim.size(), 2, - platform::errors::InvalidArgument( - "If you are using IfElse OP:" - "\n\nie = fluid.layers.IfElse(cond=cond)\nwith " - "ie.true_block():\n out_1 = ie.input(x)\n\n" - "Please ensure that the cond should be a 2-D tensor and " - "the second dim size of cond should be 1. " - "But now the cond's shape is [", - *mask_dim.Get(), "].\n")); - PADDLE_ENFORCE_EQ(mask_dim[1], 1, - platform::errors::InvalidArgument( - "If you are using IfElse OP:" - "\n\nie = fluid.layers.IfElse(cond=cond)\nwith " - "ie.true_block():\n out_1 = ie.input(x)\n\n" - "Please ensure that the cond should be a 2-D tensor " - "and the second dim size of cond should be 1. " - "But now the cond's shape is [", - *mask_dim.Get(), "].\n")); - } + auto mask_dim = context->GetInputDim("Mask"); + PADDLE_ENFORCE_EQ( + mask_dim.size(), 2, + platform::errors::InvalidArgument( + "If you are using IfElse OP:" + "\n\nie = fluid.layers.IfElse(cond=cond)\nwith " + "ie.true_block():\n out_1 = ie.input(x)\n\n" + "Please ensure that the cond should be a 2-D tensor and " + "the second dim size of cond should be 1. " + "But now the cond's shape is [", + *mask_dim.Get(), "].\n")); + PADDLE_ENFORCE_EQ(mask_dim[1], 1, + platform::errors::InvalidArgument( + "If you are using IfElse OP:" + "\n\nie = fluid.layers.IfElse(cond=cond)\nwith " + "ie.true_block():\n out_1 = ie.input(x)\n\n" + "Please ensure that the cond should be a 2-D tensor " + "and the second dim size of cond should be 1. 
" + "But now the cond's shape is [", + *mask_dim.Get(), "].\n")); context->SetOutputDim("OutTrue", context->GetInputDim("X")); context->SetOutputDim("OutFalse", context->GetInputDim("X")); diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 81401f73b2f1bb..6ca2ac36309517 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -282,12 +282,49 @@ DeviceContextPool::DeviceContextPool( } } +template +inline void EmplaceAsyncDeviceContext( + std::map>>* map_ptr, + platform::Place p, const int64_t stream_id) { + using PtrType = std::unique_ptr; + + auto* dev_ctx = new DevCtx(p); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + auto* cuda_ctx = dynamic_cast(dev_ctx); + PADDLE_ENFORCE_NOT_NULL( + cuda_ctx, platform::errors::InvalidArgument( + "Failed to dynamic_cast dev_ctx into CUDADeviceContext.")); + dev_ctx->SetAllocator( + memory::allocation::AllocatorFacade::Instance().GetAllocator(p).get()); + dev_ctx->SetPinnedAllocator( + memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CUDAPinnedPlace()) + .get()); + + cuda_ctx->PartialInitWithAllocator(); + dev_ctx->SetGenerator( + framework::GetDefaultCUDAGenerator(p.GetDeviceId()).get()); +#endif + + dev_ctx->SetHostGenerator(framework::DefaultCPUGenerator().get()); + dev_ctx->SetHostAllocator(memory::allocation::AllocatorFacade::Instance() + .GetAllocator(platform::CPUPlace()) + .get()); + dev_ctx->SetZeroAllocator(memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(p) + .get()); + + (*map_ptr)[p].emplace(stream_id, PtrType(dev_ctx)); +} + AsyncDeviceContextPool* AsyncDeviceContextPool::pool = nullptr; -platform::DeviceContext* AsyncDeviceContextPool::Get(const platform::Place& place, const int64_t stream_id) { +platform::DeviceContext* AsyncDeviceContextPool::Get( + const platform::Place& place, const int64_t stream_id) { VLOG(6) << "AsyncDeviceContextPool Get: " << place 
<< ", " << stream_id; if (!platform::is_gpu_place(place)) return nullptr; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto place_it = device_contexts_.find(place); if (place_it == device_contexts_.end()) { PADDLE_THROW(platform::errors::Unimplemented( @@ -300,10 +337,17 @@ platform::DeviceContext* AsyncDeviceContextPool::Get(const platform::Place& plac if (device_contexts_[place].count(stream_id) > 0) { return device_contexts_[place][stream_id].get(); } else { - auto* dev_ctx = new CUDADeviceContext(place); - device_contexts_[place].emplace(stream_id, std::unique_ptr(dev_ctx)); - return dev_ctx; + // auto* dev_ctx = dynamic_cast + // device_contexts_[place].emplace(stream_id, + // std::unique_ptr( + // new platform::CUDADeviceContext(place))); + EmplaceAsyncDeviceContext(&device_contexts_, place, + stream_id); + return device_contexts_[place][stream_id].get(); } +#else + return nullptr; +#endif } AsyncDeviceContextPool::AsyncDeviceContextPool( @@ -320,7 +364,8 @@ AsyncDeviceContextPool::AsyncDeviceContextPool( for (auto& p : set) { if (platform::is_gpu_place(p)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - device_contexts_.emplace(p, std::map>()); + device_contexts_.emplace( + p, std::map>()); #endif } } diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 9cdfae5eb15eab..cd2d2fa6eda50a 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -941,7 +941,8 @@ class AsyncDeviceContextPool { } /*! \brief Create should only called by Init function */ - static AsyncDeviceContextPool& Init(const std::vector& places) { + static AsyncDeviceContextPool& Init( + const std::vector& places) { if (pool == nullptr) { pool = new AsyncDeviceContextPool(places); } @@ -949,11 +950,13 @@ class AsyncDeviceContextPool { } /*! \brief Return handle of single device context. 
*/ - platform::DeviceContext* Get(const platform::Place& place, const int64_t stream_id); + platform::DeviceContext* Get(const platform::Place& place, + const int64_t stream_id); private: static AsyncDeviceContextPool* pool; - std::map>> device_contexts_; + std::map>> + device_contexts_; DISABLE_COPY_AND_ASSIGN(AsyncDeviceContextPool); }; diff --git a/paddle/fluid/platform/dynload/nvjpeg.h b/paddle/fluid/platform/dynload/nvjpeg.h index aa1137959769ce..a6649086fc18f7 100644 --- a/paddle/fluid/platform/dynload/nvjpeg.h +++ b/paddle/fluid/platform/dynload/nvjpeg.h @@ -24,34 +24,34 @@ namespace dynload { using DynLoad__##__name = phi::dynload::DynLoad__##__name; \ extern DynLoad__##__name __name -#define NVJPEG_RAND_ROUTINE_EACH(__macro) \ - __macro(nvjpegCreateSimple); \ - __macro(nvjpegCreateEx); \ - __macro(nvjpegSetDeviceMemoryPadding); \ - __macro(nvjpegSetPinnedMemoryPadding); \ - __macro(nvjpegJpegStateCreate); \ - __macro(nvjpegJpegStreamCreate); \ - __macro(nvjpegDecodeParamsCreate); \ - __macro(nvjpegDecoderCreate); \ - __macro(nvjpegDecoderStateCreate); \ - __macro(nvjpegBufferDeviceCreate); \ - __macro(nvjpegBufferPinnedCreate); \ - __macro(nvjpegDecodeParamsSetOutputFormat); \ - __macro(nvjpegDecodeParamsSetROI); \ - __macro(nvjpegStateAttachPinnedBuffer); \ - __macro(nvjpegStateAttachDeviceBuffer); \ - __macro(nvjpegJpegStreamParse); \ - __macro(nvjpegDecodeJpegHost); \ - __macro(nvjpegDecodeJpegTransferToDevice); \ - __macro(nvjpegDecodeJpegDevice); \ - __macro(nvjpegJpegStreamDestroy); \ - __macro(nvjpegDecodeParamsDestroy); \ - __macro(nvjpegDecoderDestroy); \ - __macro(nvjpegBufferDeviceDestroy); \ - __macro(nvjpegBufferPinnedDestroy); \ - __macro(nvjpegGetImageInfo); \ - __macro(nvjpegJpegStateDestroy); \ - __macro(nvjpegDestroy); \ +#define NVJPEG_RAND_ROUTINE_EACH(__macro) \ + __macro(nvjpegCreateSimple); \ + __macro(nvjpegCreateEx); \ + __macro(nvjpegSetDeviceMemoryPadding); \ + __macro(nvjpegSetPinnedMemoryPadding); \ + 
__macro(nvjpegJpegStateCreate); \ + __macro(nvjpegJpegStreamCreate); \ + __macro(nvjpegDecodeParamsCreate); \ + __macro(nvjpegDecoderCreate); \ + __macro(nvjpegDecoderStateCreate); \ + __macro(nvjpegBufferDeviceCreate); \ + __macro(nvjpegBufferPinnedCreate); \ + __macro(nvjpegDecodeParamsSetOutputFormat); \ + __macro(nvjpegDecodeParamsSetROI); \ + __macro(nvjpegStateAttachPinnedBuffer); \ + __macro(nvjpegStateAttachDeviceBuffer); \ + __macro(nvjpegJpegStreamParse); \ + __macro(nvjpegDecodeJpegHost); \ + __macro(nvjpegDecodeJpegTransferToDevice); \ + __macro(nvjpegDecodeJpegDevice); \ + __macro(nvjpegJpegStreamDestroy); \ + __macro(nvjpegDecodeParamsDestroy); \ + __macro(nvjpegDecoderDestroy); \ + __macro(nvjpegBufferDeviceDestroy); \ + __macro(nvjpegBufferPinnedDestroy); \ + __macro(nvjpegGetImageInfo); \ + __macro(nvjpegJpegStateDestroy); \ + __macro(nvjpegDestroy); \ __macro(nvjpegDecode); NVJPEG_RAND_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_NVJPEG_WRAP); diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 9b085a4308567d..ffbd93b101a080 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -71,11 +71,11 @@ limitations under the License. 
*/ #include "paddle/phi/backends/dynload/port.h" #ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/dynload/nvjpeg.h" #include "paddle/phi/backends/dynload/cublas.h" #include "paddle/phi/backends/dynload/cudnn.h" #include "paddle/phi/backends/dynload/curand.h" #include "paddle/phi/backends/dynload/cusolver.h" -#include "paddle/fluid/platform/dynload/nvjpeg.h" #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) #include #include "paddle/phi/backends/dynload/nccl.h" @@ -288,7 +288,8 @@ inline const char* GetErrorMsgUrl(T status) { "index.html#cusparseStatus_t"; break; case platform::proto::ApiType::NVJPEG: - return "https://docs.nvidia.com/cuda/nvjpeg/index.html#nvjpeg-api-return-codes"; + return "https://docs.nvidia.com/cuda/nvjpeg/" + "index.html#nvjpeg-api-return-codes"; break; default: return "Unknown type of External API, can't get error message URL!"; @@ -462,19 +463,30 @@ inline std::string build_nvidia_error_msg(cufftResult_t stat) { } /**************** NVJPEG ERROR ****************/ -inline bool is_error(nvjpegStatus_t stat) { return stat != NVJPEG_STATUS_SUCCESS; } +inline bool is_error(nvjpegStatus_t stat) { + return stat != NVJPEG_STATUS_SUCCESS; +} inline std::string get_nvjpeg_error_str(nvjpegStatus_t stat) { switch (stat) { - case NVJPEG_STATUS_SUCCESS: return "NVJPEG_STATUS_SUCCESS"; - case NVJPEG_STATUS_NOT_INITIALIZED: return "NVJPEG_STATUS_NOT_INITIALIZED"; - case NVJPEG_STATUS_INVALID_PARAMETER: return "NVJPEG_STATUS_INVALID_PARAMETER"; - case NVJPEG_STATUS_BAD_JPEG: return "NVJPEG_STATUS_BAD_JPEG"; - case NVJPEG_STATUS_JPEG_NOT_SUPPORTED: return "NVJPEG_STATUS_JPEG_NOT_SUPPORTED"; - case NVJPEG_STATUS_ALLOCATOR_FAILURE: return "NVJPEG_STATUS_ALLOCATOR_FAILURE"; - case NVJPEG_STATUS_EXECUTION_FAILED: return "NVJPEG_STATUS_EXECUTION_FAILED"; - case NVJPEG_STATUS_ARCH_MISMATCH: return "NVJPEG_STATUS_ARCH_MISMATCH"; - case NVJPEG_STATUS_INTERNAL_ERROR: return "NVJPEG_STATUS_INTERNAL_ERROR"; + case NVJPEG_STATUS_SUCCESS: + return 
"NVJPEG_STATUS_SUCCESS"; + case NVJPEG_STATUS_NOT_INITIALIZED: + return "NVJPEG_STATUS_NOT_INITIALIZED"; + case NVJPEG_STATUS_INVALID_PARAMETER: + return "NVJPEG_STATUS_INVALID_PARAMETER"; + case NVJPEG_STATUS_BAD_JPEG: + return "NVJPEG_STATUS_BAD_JPEG"; + case NVJPEG_STATUS_JPEG_NOT_SUPPORTED: + return "NVJPEG_STATUS_JPEG_NOT_SUPPORTED"; + case NVJPEG_STATUS_ALLOCATOR_FAILURE: + return "NVJPEG_STATUS_ALLOCATOR_FAILURE"; + case NVJPEG_STATUS_EXECUTION_FAILED: + return "NVJPEG_STATUS_EXECUTION_FAILED"; + case NVJPEG_STATUS_ARCH_MISMATCH: + return "NVJPEG_STATUS_ARCH_MISMATCH"; + case NVJPEG_STATUS_INTERNAL_ERROR: + return "NVJPEG_STATUS_INTERNAL_ERROR"; case NVJPEG_STATUS_IMPLEMENTATION_NOT_SUPPORTED: return "NVJPEG_STATUS_IMPLEMENTATION_NOT_SUPPORTED"; } @@ -540,19 +552,19 @@ inline std::string build_nvidia_error_msg(ncclResult_t nccl_result) { } \ } while (0) -#define PADDLE_ENFORCE_NVJPEG_SUCCESS(COND) \ - do { \ - auto __cond__ = (COND); \ - using __NVJPEG_STATUS_TYPE__ = decltype(__cond__); \ - constexpr auto __success_type__ = \ - ::paddle::platform::details::ExternalApiType< \ - __NVJPEG_STATUS_TYPE__>::kSuccess; \ - if (UNLIKELY(__cond__ != __success_type__)) { \ - auto __summary__ = ::paddle::platform::errors::External( \ - "Nvjpeg failed: %s", \ - ::paddle::platform::get_nvjpeg_error_str(__cond__)); \ - __THROW_ERROR_INTERNAL__(__summary__); \ - } \ +#define PADDLE_ENFORCE_NVJPEG_SUCCESS(COND) \ + do { \ + auto __cond__ = (COND); \ + using __NVJPEG_STATUS_TYPE__ = decltype(__cond__); \ + constexpr auto __success_type__ = \ + ::paddle::platform::details::ExternalApiType< \ + __NVJPEG_STATUS_TYPE__>::kSuccess; \ + if (UNLIKELY(__cond__ != __success_type__)) { \ + auto __summary__ = ::paddle::platform::errors::External( \ + "Nvjpeg failed: %s", \ + ::paddle::platform::get_nvjpeg_error_str(__cond__)); \ + __THROW_ERROR_INTERNAL__(__summary__); \ + } \ } while (0) inline void retry_sleep(unsigned milliseconds) { diff --git a/paddle/fluid/pybind/protobuf.cc 
b/paddle/fluid/pybind/protobuf.cc index adc9c918f8116b..78f0e88e6232bf 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -235,7 +235,8 @@ void BindVarDsec(pybind11::module *m) { .value("LOD_TENSOR_ARRAY", pd::proto::VarType::LOD_TENSOR_ARRAY) .value("PLACE_LIST", pd::proto::VarType::PLACE_LIST) .value("READER", pd::proto::VarType::READER) - .value("LOD_TENSOR_BLOCKING_QUEUE", pd::proto::VarType::LOD_TENSOR_BLOCKING_QUEUE) + .value("LOD_TENSOR_BLOCKING_QUEUE", + pd::proto::VarType::LOD_TENSOR_BLOCKING_QUEUE) .value("RAW", pd::proto::VarType::RAW) .value("STRING", pd::proto::VarType::STRING) .value("STRINGS", pd::proto::VarType::STRINGS) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index f13d28b47c5e57..bcb7b09bbd4e9f 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -70,8 +70,8 @@ limitations under the License. */ #include "paddle/fluid/memory/allocation/mmap_allocator.h" #include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/common_infer_shape_functions.h" -#include "paddle/fluid/operators/py_func_op.h" #include "paddle/fluid/operators/data/utils.h" +#include "paddle/fluid/operators/py_func_op.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/device/device_wrapper.h" @@ -772,15 +772,13 @@ PYBIND11_MODULE(core_noavx, m) { &paddle::operators::data::ShutDownAllDataLoaders); m.def("_shutdown_readers_and_decoders", &paddle::operators::data::ShutDownReadersAndDecoders); - m.def("_shutdown_pipeline", - &paddle::operators::data::ShutDownPipeline); + m.def("_shutdown_pipeline", &paddle::operators::data::ShutDownPipeline); m.def("_reset_dataloader", [](const int64_t reader_id, const std::vector map_ids, const int64_t pipeline_id) { - paddle::operators::data::ResetDataLoader( - reader_id, map_ids, pipeline_id); + paddle::operators::data::ResetDataLoader(reader_id, map_ids, 
pipeline_id); - }); + }); py::class_ custom_op_kernel_ctx( m, "CustomOpKernelContext", R"DOC()DOC"); diff --git a/paddle/phi/backends/dynload/nvjpeg.h b/paddle/phi/backends/dynload/nvjpeg.h index 3537e2a1c3197f..fd40bb795f3911 100644 --- a/paddle/phi/backends/dynload/nvjpeg.h +++ b/paddle/phi/backends/dynload/nvjpeg.h @@ -36,34 +36,34 @@ extern void *nvjpeg_dso_handle; }; \ extern DynLoad__##__name __name -#define NVJPEG_RAND_ROUTINE_EACH(__macro) \ - __macro(nvjpegCreateSimple); \ - __macro(nvjpegCreateEx); \ - __macro(nvjpegSetDeviceMemoryPadding); \ - __macro(nvjpegSetPinnedMemoryPadding); \ - __macro(nvjpegJpegStateCreate); \ - __macro(nvjpegJpegStreamCreate); \ - __macro(nvjpegDecodeParamsCreate); \ - __macro(nvjpegDecoderCreate); \ - __macro(nvjpegDecoderStateCreate); \ - __macro(nvjpegBufferDeviceCreate); \ - __macro(nvjpegBufferPinnedCreate); \ - __macro(nvjpegDecodeParamsSetOutputFormat); \ - __macro(nvjpegDecodeParamsSetROI); \ - __macro(nvjpegStateAttachPinnedBuffer); \ - __macro(nvjpegStateAttachDeviceBuffer); \ - __macro(nvjpegJpegStreamParse); \ - __macro(nvjpegDecodeJpegHost); \ - __macro(nvjpegDecodeJpegTransferToDevice); \ - __macro(nvjpegDecodeJpegDevice); \ - __macro(nvjpegJpegStreamDestroy); \ - __macro(nvjpegDecodeParamsDestroy); \ - __macro(nvjpegDecoderDestroy); \ - __macro(nvjpegBufferDeviceDestroy); \ - __macro(nvjpegBufferPinnedDestroy); \ - __macro(nvjpegGetImageInfo); \ - __macro(nvjpegJpegStateDestroy); \ - __macro(nvjpegDestroy); \ +#define NVJPEG_RAND_ROUTINE_EACH(__macro) \ + __macro(nvjpegCreateSimple); \ + __macro(nvjpegCreateEx); \ + __macro(nvjpegSetDeviceMemoryPadding); \ + __macro(nvjpegSetPinnedMemoryPadding); \ + __macro(nvjpegJpegStateCreate); \ + __macro(nvjpegJpegStreamCreate); \ + __macro(nvjpegDecodeParamsCreate); \ + __macro(nvjpegDecoderCreate); \ + __macro(nvjpegDecoderStateCreate); \ + __macro(nvjpegBufferDeviceCreate); \ + __macro(nvjpegBufferPinnedCreate); \ + __macro(nvjpegDecodeParamsSetOutputFormat); \ + 
__macro(nvjpegDecodeParamsSetROI); \ + __macro(nvjpegStateAttachPinnedBuffer); \ + __macro(nvjpegStateAttachDeviceBuffer); \ + __macro(nvjpegJpegStreamParse); \ + __macro(nvjpegDecodeJpegHost); \ + __macro(nvjpegDecodeJpegTransferToDevice); \ + __macro(nvjpegDecodeJpegDevice); \ + __macro(nvjpegJpegStreamDestroy); \ + __macro(nvjpegDecodeParamsDestroy); \ + __macro(nvjpegDecoderDestroy); \ + __macro(nvjpegBufferDeviceDestroy); \ + __macro(nvjpegBufferPinnedDestroy); \ + __macro(nvjpegGetImageInfo); \ + __macro(nvjpegJpegStateDestroy); \ + __macro(nvjpegDestroy); \ __macro(nvjpegDecode); NVJPEG_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVJPEG_WRAP); diff --git a/python/paddle/fluid/core.py b/python/paddle/fluid/core.py index 064aa7d175af23..0d8cf47e9e252c 100644 --- a/python/paddle/fluid/core.py +++ b/python/paddle/fluid/core.py @@ -344,7 +344,7 @@ def to_list(s): from .core_noavx import _shutdown_all_dataloaders from .core_noavx import _shutdown_readers_and_decoders from .core_noavx import _shutdown_pipeline - from .core_noavx import _reset_dataloder + from .core_noavx import _reset_dataloader from .core_noavx import _Profiler, _ProfilerResult, _RecordEvent if sys.platform != 'win32': from .core_noavx import _set_process_pids diff --git a/python/paddle/fluid/dataloader/ops.py b/python/paddle/fluid/dataloader/ops.py index 67ac5d4be148b5..020aae1596b923 100755 --- a/python/paddle/fluid/dataloader/ops.py +++ b/python/paddle/fluid/dataloader/ops.py @@ -79,37 +79,44 @@ def map(map_func, *args, **kwargs): # 3. 
flat_inputs: holds variables in main_program/global_block in # flatten format, will be used as inputs for appendding map OP # and _parse_program_outputs follows similar logic - def _build_program_inputs(inputs, map_block, - input_vars=[], flat_inputs=[]): + def _build_program_inputs(inputs, map_block, input_vars=[], flat_inputs=[]): if isinstance(inputs, Sequence): - return [_build_program_inputs(inp, map_block, input_vars, - flat_inputs) for inp in inputs] + return [ + _build_program_inputs(inp, map_block, input_vars, flat_inputs) + for inp in inputs + ] elif isinstance(inputs, Mapping): - return {k: _build_program_inputs(v, map_block, input_vars, - flat_inputs) for k,v in inputs.items()} + return { + k: _build_program_inputs(v, map_block, input_vars, flat_inputs) + for k, v in inputs.items() + } else: var = map_block.create_var( - name=unique_name.generate("map_sub"), - type=inputs.desc.type(), - dtype=inputs.desc.dtype(), - persistable=False) + name=unique_name.generate("map_sub"), + type=inputs.desc.type(), + dtype=inputs.desc.dtype(), + persistable=False) input_vars.append(var) flat_inputs.append(inputs) return var def _parse_program_outputs(outputs, output_vars=[], flat_outputs=[]): if isinstance(outputs, Sequence): - return [_parse_program_outputs(outp, output_vars, - flat_outputs) for outp in outputs] + return [ + _parse_program_outputs(outp, output_vars, flat_outputs) + for outp in outputs + ] elif isinstance(outputs, Mapping): - return {k: _parse_program_outputs(v, output_vars, - flat_outputs) for outp in outputs} + return { + k: _parse_program_outputs(v, output_vars, flat_outputs) + for outp in outputs + } else: var = helper.create_variable( - name=unique_name.generate("map"), - type=outputs.desc.type(), - dtype=outputs.desc.dtype(), - persistable=True) + name=unique_name.generate("map"), + type=outputs.desc.type(), + dtype=outputs.desc.dtype(), + persistable=True) flat_outputs.append(var) output_vars.append(outputs) return var @@ -121,10 +128,10 @@ def 
_parse_program_outputs(outputs, output_vars=[], flat_outputs=[]): map_block = main_program.current_block() input_vars, flat_inputs = [], [] - program_inputs_args = _build_program_inputs( - args, map_block, input_vars, flat_inputs) - program_inputs_kwargs = _build_program_inputs( - kwargs, map_block, input_vars, flat_inputs) + program_inputs_args = _build_program_inputs(args, map_block, input_vars, + flat_inputs) + program_inputs_kwargs = _build_program_inputs(kwargs, map_block, + input_vars, flat_inputs) program_outputs = map_func(*program_inputs_args, **program_inputs_kwargs) @@ -132,8 +139,7 @@ def _parse_program_outputs(outputs, output_vars=[], flat_outputs=[]): # NOTE: _parse_program_outputs create main_program variables, so # we need to call it outside of _ProgramGuard output_vars, flat_outputs = [], [] - outputs = _parse_program_outputs(program_outputs, output_vars, - flat_outputs) + outputs = _parse_program_outputs(program_outputs, output_vars, flat_outputs) input_var_names = [v.name for v in input_vars] output_var_names = [v.name for v in output_vars] diff --git a/python/paddle/fluid/dataloader/pipeline.py b/python/paddle/fluid/dataloader/pipeline.py index 0f38163ea558d4..1aee03e4b93e45 100755 --- a/python/paddle/fluid/dataloader/pipeline.py +++ b/python/paddle/fluid/dataloader/pipeline.py @@ -26,10 +26,8 @@ __all__ = ["DataPipeline"] - CleanupFuncRegistrar.register(core._shutdown_all_dataloaders) - AVAILABLE_OP_TYPES = ['data_reader', 'map'] @@ -48,10 +46,10 @@ def __init__(self, queue_depth=2): self._init_programs() self.is_shutdown = False - + if paddle.distributed.ParallelEnv().nranks > 1: - paddle.set_device('gpu:%d' % - paddle.distributed.ParallelEnv().dev_id) + paddle.set_device('gpu:%d' % + paddle.distributed.ParallelEnv().dev_id) def _init_programs(self): self._main_program = fluid.Program() @@ -67,7 +65,7 @@ def __enter__(self): def __exit__(self, exception_type, exception_value, traceback): self._main_program = 
framework.switch_main_program(self._main_program) - + local_rank = paddle.distributed.get_rank() paddle.disable_static("gpu:" + str(local_rank)) @@ -142,7 +140,7 @@ def next(self): return self.__next__() def reset(self): - reader_id= _hash_with_id(self._main_program) + reader_id = _hash_with_id(self._main_program) map_ids = [] for op in self._main_program.block(0).ops: diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py index 6e7a2cc0da1337..ca4f1ec65bd0e3 100644 --- a/python/paddle/fluid/reader.py +++ b/python/paddle/fluid/reader.py @@ -328,6 +328,8 @@ def __init__(self, worker_init_fn=None, persistent_workers=False): + # Whether use multi-stream/thread GPU DataLoader + self._use_data_pipeline = False if callable(dataset): self._use_data_pipeline = True with DataPipeline() as self._data_pipeline: diff --git a/python/paddle/tests/CMakeLists.txt b/python/paddle/tests/CMakeLists.txt index 0babdee3a08849..454244ec96ac21 100644 --- a/python/paddle/tests/CMakeLists.txt +++ b/python/paddle/tests/CMakeLists.txt @@ -8,6 +8,14 @@ foreach(TEST_OP ${DIST_TEST_OPS}) list(REMOVE_ITEM TEST_OPS ${TEST_OP}) endforeach() +if (WIN32) + LIST(REMOVE_ITEM TEST_OPS test_data_pipeline) + LIST(REMOVE_ITEM TEST_OPS test_ops_decode) + LIST(REMOVE_ITEM TEST_OPS test_ops_crop_resize) + LIST(REMOVE_ITEM TEST_OPS test_ops_file_label_loader) + LIST(REMOVE_ITEM TEST_OPS test_ops_mirror_normalize) + LIST(REMOVE_ITEM TEST_OPS test_data_apis) +endif() if(NOT WITH_COVERAGE) LIST(REMOVE_ITEM TEST_OPS test_hapi_hub) endif() diff --git a/python/paddle/tests/test_data_apis.py b/python/paddle/tests/test_data_apis.py index e9ea5874f36cc7..43ed9e6ff7c9c7 100644 --- a/python/paddle/tests/test_data_apis.py +++ b/python/paddle/tests/test_data_apis.py @@ -50,7 +50,8 @@ def test_output_dynamic(self): def test_output_static(self): paddle.enable_static() - input_data = paddle.static.data(shape=[16, 3, 32, 32], dtype="float32", name="input") + input_data = paddle.static.data( + shape=[16, 3, 
32, 32], dtype="float32", name="input") out_data = random_flip(input_data, 0.5) places = [paddle.CPUPlace()] @@ -59,9 +60,11 @@ def test_output_static(self): for place in places: exe = paddle.static.Executor(place) - out, = exe.run(paddle.static.default_main_program(), - feed={"input": np.ones([16, 3, 32, 32], dtype="float32")}, - fetch_list=[out_data]) + out, = exe.run( + paddle.static.default_main_program(), + feed={"input": np.ones( + [16, 3, 32, 32], dtype="float32")}, + fetch_list=[out_data]) assert out.dtype == np.bool assert out.shape == (16, 1) paddle.disable_static() diff --git a/python/paddle/tests/test_data_pipeline.py b/python/paddle/tests/test_data_pipeline.py index 2c8e3124503ef2..15d6310aab3c30 100644 --- a/python/paddle/tests/test_data_pipeline.py +++ b/python/paddle/tests/test_data_pipeline.py @@ -26,11 +26,10 @@ random_flip, mirror_normalize from paddle.vision.reader import file_label_reader - DATASET_HOME = os.path.expanduser("~/.cache/paddle/datasets") DATASET_URL = "https://paddlemodels.cdn.bcebos.com/ImageNet_stub.tar" DATASET_MD5 = "c7110519124a433901cf005a4a91b607" -IMAGE_NUM = 100 +IMAGE_NUM = 100 class TestDataPipelineCase1(unittest.TestCase): @@ -38,7 +37,7 @@ def setUp(self): self.data_root = get_path_from_url(DATASET_URL, DATASET_HOME, DATASET_MD5) - self.num_epoches= 2 + self.num_epoches = 2 self.batch_size = 16 self.num_threads = 2 self.host_memory_padding = 1000000 @@ -74,13 +73,15 @@ def calc_iter_info(self): def build_reader(self): def imagenet_reader(): - image, label = file_label_reader(self.data_root, - batch_size=self.batch_size, - shuffle=self.shuffle, - drop_last=self.drop_last) + image, label = file_label_reader( + self.data_root, + batch_size=self.batch_size, + shuffle=self.shuffle, + drop_last=self.drop_last) + def decode(image): image = image_decode_random_crop( - image, num_threads=self.num_threads) + image, num_threads=self.num_threads) return image def resize(image): @@ -89,9 +90,8 @@ def resize(image): def 
flip_normalize(image): mirror = random_flip(image, prob=self.flip_prob) - image = mirror_normalize(image, mirror, - mean=self.mean, - std=self.std) + image = mirror_normalize( + image, mirror, mean=self.mean, std=self.std) return image image = paddle.io.map(decode, image) @@ -99,10 +99,14 @@ def flip_normalize(image): image = paddle.io.map(flip_normalize, image) return {'image': image, 'label': label} - + self.reader = imagenet_reader def test_static_output(self): + # NOTE: only supoort CUDA kernel currently + if not core.is_compiled_with_cuda(): + return + loader = paddle.io.DataLoader(self.reader) for eid in range(self.num_epoches): @@ -131,19 +135,13 @@ def test_static_output(self): if eid < self.num_epoches - 1: loader.reset() - del loader - - def test_shutdown(self): - loader = paddle.io.DataLoader(self.reader) - core._shutdown_all_dataloaders() - class TestDataPipelineCase2(TestDataPipelineCase1): def setUp(self): self.data_root = get_path_from_url(DATASET_URL, DATASET_HOME, DATASET_MD5) - self.num_epoches= 1 + self.num_epoches = 1 self.batch_size = 32 self.num_threads = 4 self.host_memory_padding = 0 diff --git a/python/paddle/tests/test_ops_crop_resize.py b/python/paddle/tests/test_ops_crop_resize.py index a4911f9006af8f..bb03f7f68f7c48 100644 --- a/python/paddle/tests/test_ops_crop_resize.py +++ b/python/paddle/tests/test_ops_crop_resize.py @@ -23,10 +23,7 @@ from paddle.vision.ops import image_resize, random_crop_and_resize -def np_nearest_interp(image, - size, - align_corners=True, - data_format='NCHW'): +def np_nearest_interp(image, size, align_corners=True, data_format='NCHW'): """nearest neighbor interpolation implement in shape [N, C, H, W]""" if isinstance(size, int): size = (size, size) @@ -138,8 +135,11 @@ def np_bilinear_interp(image, return out.astype(image.dtype) -def np_image_resize(images, size, interp_method, - align_corners=True, align_mode=1, +def np_image_resize(images, + size, + interp_method, + align_corners=True, + align_mode=1, 
data_format="NCHW"): if isinstance(size, int): size = (size, size) @@ -147,17 +147,21 @@ def np_image_resize(images, size, interp_method, results = [] if interp_method == "nearest": for image in images: - results.append(np_nearest_interp(image, - size=size, - align_corners=align_corners, - data_format=data_format)) + results.append( + np_nearest_interp( + image, + size=size, + align_corners=align_corners, + data_format=data_format)) elif interp_method == "bilinear": for image in images: - results.append(np_bilinear_interp(image, - size=size, - align_corners=align_corners, - align_mode=align_mode, - data_format=data_format)) + results.append( + np_bilinear_interp( + image, + size=size, + align_corners=align_corners, + align_mode=align_mode, + data_format=data_format)) else: raise ValueError("unknown interp_method") @@ -177,15 +181,17 @@ def setUp(self): self.build_np_data() def build_np_data(self): - self.image1 = np.random.randint(0, 256, self.image_shape1, dtype="uint8") - self.image2 = np.random.randint(0, 256, self.image_shape2, dtype="uint8") + self.image1 = np.random.randint( + 0, 256, self.image_shape1, dtype="uint8") + self.image2 = np.random.randint( + 0, 256, self.image_shape2, dtype="uint8") self.np_result = np_image_resize( - [self.image1, self.image2], - size=self.size, - interp_method=self.interp_method, - align_corners=self.align_corners, - align_mode=self.align_mode, - data_format=self.data_format) + [self.image1, self.image2], + size=self.size, + interp_method=self.interp_method, + align_corners=self.align_corners, + align_mode=self.align_mode, + data_format=self.data_format) def test_output_dynamic(self): # NOTE: only support cuda kernel currently @@ -195,16 +201,18 @@ def test_output_dynamic(self): paddle.disable_static() images = paddle.tensor.create_array(dtype="uint8") - images = paddle.tensor.array_write(paddle.to_tensor(self.image1), - paddle.to_tensor(0), images) - images = paddle.tensor.array_write(paddle.to_tensor(self.image2), - 
paddle.to_tensor(1), images) - - result = image_resize(images, self.size, - interp_method=self.interp_method, - align_corners=self.align_corners, - align_mode=self.align_mode, - data_format=self.data_format) + images = paddle.tensor.array_write( + paddle.to_tensor(self.image1), paddle.to_tensor(0), images) + images = paddle.tensor.array_write( + paddle.to_tensor(self.image2), paddle.to_tensor(1), images) + + result = image_resize( + images, + self.size, + interp_method=self.interp_method, + align_corners=self.align_corners, + align_mode=self.align_mode, + data_format=self.data_format) assert np.allclose(result.numpy(), self.np_result, rtol=1) def test_output_static(self): @@ -220,15 +228,17 @@ def test_output_static(self): image2 = fluid.layers.assign(self.image2.astype('int32')) image2 = fluid.layers.cast(image2, dtype='uint8') - out = image_resize([image1, image2], self.size, - interp_method=self.interp_method, - align_corners=self.align_corners, - align_mode=self.align_mode, - data_format=self.data_format) + out = image_resize( + [image1, image2], + self.size, + interp_method=self.interp_method, + align_corners=self.align_corners, + align_mode=self.align_mode, + data_format=self.data_format) exe = paddle.static.Executor(paddle.CUDAPlace(0)) result, = exe.run(paddle.static.default_main_program(), - fetch_list=[out]) + fetch_list=[out]) assert np.allclose(result, self.np_result, rtol=1) paddle.disable_static() @@ -366,8 +376,10 @@ def setUp(self): self.build_np_data() def build_np_data(self): - self.image1 = np.random.randint(0, 256, self.image_shape1, dtype="uint8") - self.image2 = np.random.randint(0, 256, self.image_shape2, dtype="uint8") + self.image1 = np.random.randint( + 0, 256, self.image_shape1, dtype="uint8") + self.image2 = np.random.randint( + 0, 256, self.image_shape2, dtype="uint8") def test_output_dynamic(self): # NOTE: only support cuda kernel currently @@ -377,17 +389,18 @@ def test_output_dynamic(self): paddle.disable_static() images = 
paddle.tensor.create_array(dtype="uint8") - images = paddle.tensor.array_write(paddle.to_tensor(self.image1), - paddle.to_tensor(0), images) - images = paddle.tensor.array_write(paddle.to_tensor(self.image2), - paddle.to_tensor(1), images) + images = paddle.tensor.array_write( + paddle.to_tensor(self.image1), paddle.to_tensor(0), images) + images = paddle.tensor.array_write( + paddle.to_tensor(self.image2), paddle.to_tensor(1), images) result = random_crop_and_resize( - images, self.size, - interp_method=self.interp_method, - align_corners=self.align_corners, - align_mode=self.align_mode, - data_format=self.data_format) + images, + self.size, + interp_method=self.interp_method, + align_corners=self.align_corners, + align_mode=self.align_mode, + data_format=self.data_format) result = result.numpy() assert result.shape == self.out_shape assert result.dtype == np.uint8 @@ -406,15 +419,16 @@ def test_output_static(self): image2 = fluid.layers.cast(image2, dtype='uint8') out = random_crop_and_resize( - [image1, image2], self.size, - interp_method=self.interp_method, - align_corners=self.align_corners, - align_mode=self.align_mode, - data_format=self.data_format) + [image1, image2], + self.size, + interp_method=self.interp_method, + align_corners=self.align_corners, + align_mode=self.align_mode, + data_format=self.data_format) exe = paddle.static.Executor(paddle.CUDAPlace(0)) result, = exe.run(paddle.static.default_main_program(), - fetch_list=[out]) + fetch_list=[out]) assert result.shape == self.out_shape assert result.dtype == np.uint8 @@ -527,7 +541,8 @@ def setUp(self): self.build_np_data() -class TestImageCropResizeBilinearNCHWAlignCorner(TestImageCropResizeNearestNCHW): +class TestImageCropResizeBilinearNCHWAlignCorner( + TestImageCropResizeNearestNCHW): def setUp(self): self.image_shape1 = [3, 16, 16] self.image_shape2 = [3, 32, 32] diff --git a/python/paddle/tests/test_ops_decode.py b/python/paddle/tests/test_ops_decode.py index 18879f75aaa5c3..4bff90f7672fbb 
100644 --- a/python/paddle/tests/test_ops_decode.py +++ b/python/paddle/tests/test_ops_decode.py @@ -22,10 +22,9 @@ import paddle.fluid.core as core from paddle.utils.download import get_path_from_url from paddle.vision.datasets import DatasetFolder -from paddle.vision.ops import image_decode, image_decode_random_crop +from paddle.vision.ops import image_decode, image_decode_random_crop from paddle.vision.reader import file_label_loader - DATASET_HOME = os.path.expanduser("~/.cache/paddle/datasets") DATASET_URL = "https://paddlemodels.cdn.bcebos.com/ImageNet_stub.tar" DATASET_MD5 = "c7110519124a433901cf005a4a91b607" @@ -139,14 +138,15 @@ def test_static_output(self): indices = paddle.arange(self.batch_size) image, label = file_label_loader(self.data_root, indices, self.batch_size) - image = image_decode_random_crop(image, - num_threads=self.num_threads, - aspect_ratio_min=self.aspect_ratio_min, - aspect_ratio_max=self.aspect_ratio_max, - area_min=self.area_min, - area_max=self.area_max, - num_attempts=self.num_attempts, - data_format=self.data_format) + image = image_decode_random_crop( + image, + num_threads=self.num_threads, + aspect_ratio_min=self.aspect_ratio_min, + aspect_ratio_max=self.aspect_ratio_max, + area_min=self.area_min, + area_max=self.area_max, + num_attempts=self.num_attempts, + data_format=self.data_format) exe = paddle.static.Executor(paddle.CUDAPlace(0)) rets = exe.run(paddle.static.default_main_program(), fetch_list=image + [label]) @@ -177,14 +177,15 @@ def test_dynamic_output(self): indices = paddle.arange(self.batch_size) image, label = file_label_loader(self.data_root, indices, self.batch_size) - image = image_decode_random_crop(image, - num_threads=self.num_threads, - aspect_ratio_min=self.aspect_ratio_min, - aspect_ratio_max=self.aspect_ratio_max, - area_min=self.area_min, - area_max=self.area_max, - num_attempts=self.num_attempts, - data_format=self.data_format) + image = image_decode_random_crop( + image, + num_threads=self.num_threads, 
+ aspect_ratio_min=self.aspect_ratio_min, + aspect_ratio_max=self.aspect_ratio_max, + area_min=self.area_min, + area_max=self.area_max, + num_attempts=self.num_attempts, + data_format=self.data_format) assert len(image) == self.batch_size for i in range(self.batch_size): @@ -221,7 +222,8 @@ def setUp(self): self.channel_dim = 2 -class TestImageReaderDecodeRandomCropThread8(TestImageReaderDecodeRandomCropNCHW): +class TestImageReaderDecodeRandomCropThread8( + TestImageReaderDecodeRandomCropNCHW): def setUp(self): self.data_root = get_path_from_url(DATASET_URL, DATASET_HOME, DATASET_MD5) diff --git a/python/paddle/tests/test_ops_file_label_loader.py b/python/paddle/tests/test_ops_file_label_loader.py index 91f34dcfdb7677..17dbfa6030d7a0 100644 --- a/python/paddle/tests/test_ops_file_label_loader.py +++ b/python/paddle/tests/test_ops_file_label_loader.py @@ -26,11 +26,11 @@ from paddle.vision.datasets import DatasetFolder from paddle.vision.reader import _sampler_manager, file_label_loader - DATASET_HOME = os.path.expanduser("~/.cache/paddle/datasets") DATASET_URL = "https://paddlemodels.cdn.bcebos.com/ImageNet_stub.tar" DATASET_MD5 = "c7110519124a433901cf005a4a91b607" + class TestFileLabelLoaderStatic(unittest.TestCase): def setup(self): self.data_root = get_path_from_url(DATASET_URL, DATASET_HOME, @@ -46,8 +46,9 @@ def setup(self): def build_program(self): paddle.enable_static() self.indices_data = paddle.static.data( - shape=[self.batch_size], dtype='int64', name='indices') - self.sample_data, self.label_data = file_label_loader(self.data_root, self.indices_data, self.batch_size) + shape=[self.batch_size], dtype='int64', name='indices') + self.sample_data, self.label_data = file_label_loader( + self.data_root, self.indices_data, self.batch_size) self.exe = paddle.static.Executor(paddle.CPUPlace()) paddle.disable_static() @@ -59,8 +60,7 @@ def loader_function(self, indices): paddle.enable_static() return self.exe.run(paddle.static.default_main_program(), 
feed={'indices': indices}, - fetch_list=[self.sample_data, - self.label_data]) + fetch_list=[self.sample_data, self.label_data]) def test_check_output(self): self.setup() @@ -70,9 +70,8 @@ def test_check_output(self): targets = [s[1] for s in data_folder.samples] sampler_id = fluid.layers.utils._hash_with_id( - self.data_root, self.batch_size, - self.shuffle, self.drop_last, - self.dynamic) + self.data_root, self.batch_size, self.shuffle, self.drop_last, + self.dynamic) sampler = _sampler_manager.get(sampler_id, batch_size=self.batch_size, num_samples=len(samples), @@ -93,7 +92,7 @@ def setup(self): self.batch_size = 16 self.shuffle = False self.drop_last = False - self.dynamic = True + self.dynamic = True if not self.dynamic: self.build_program() @@ -104,9 +103,9 @@ def setup(self): self.data_root = get_path_from_url(DATASET_URL, DATASET_HOME, DATASET_MD5) self.batch_size = 16 - self.shuffle = True + self.shuffle = True self.drop_last = False - self.dynamic = False + self.dynamic = False if not self.dynamic: self.build_program() @@ -117,9 +116,9 @@ def setup(self): self.data_root = get_path_from_url(DATASET_URL, DATASET_HOME, DATASET_MD5) self.batch_size = 16 - self.shuffle = True + self.shuffle = True self.drop_last = False - self.dynamic = True + self.dynamic = True if not self.dynamic: self.build_program() @@ -130,8 +129,8 @@ def setup(self): self.data_root = get_path_from_url(DATASET_URL, DATASET_HOME, DATASET_MD5) self.batch_size = 16 - self.shuffle = True - self.drop_last = True + self.shuffle = True + self.drop_last = True self.dynamic = False if not self.dynamic: @@ -143,8 +142,8 @@ def setup(self): self.data_root = get_path_from_url(DATASET_URL, DATASET_HOME, DATASET_MD5) self.batch_size = 16 - self.shuffle = True - self.drop_last = True + self.shuffle = True + self.drop_last = True self.dynamic = True if not self.dynamic: diff --git a/python/paddle/tests/test_ops_mirror_normalize.py b/python/paddle/tests/test_ops_mirror_normalize.py index 
0c3270aea09148..fa536eb6629ec4 100644 --- a/python/paddle/tests/test_ops_mirror_normalize.py +++ b/python/paddle/tests/test_ops_mirror_normalize.py @@ -34,9 +34,9 @@ def np_mirror_normalize(image, mirror, mean, std): mean = np.array(mean) std = np.array(std) if np.size(mean) == 1: - mean = np.tile(mean, (3,)) + mean = np.tile(mean, (3, )) if np.size(std) == 1: - std = np.tile(std, (3,)) + std = np.tile(std, (3, )) mean = np.array(mean[:]).reshape([1, 3, 1, 1]) std = np.array(std[:]).reshape([1, 3, 1, 1]) @@ -51,20 +51,22 @@ def setUp(self): self.mean = [123.675, 116.28, 103.53] self.std = [58.395, 57.120, 57.375] - self.image = np.random.randint(0, 256, self.image_shape, 'int32').astype("float32") - self.mirror = np.random.randint(0, 2, self.mirror_shape, 'int32').astype("bool") + self.image = np.random.randint(0, 256, self.image_shape, + 'int32').astype("float32") + self.mirror = np.random.randint(0, 2, self.mirror_shape, + 'int32').astype("bool") - self.result = np_mirror_normalize(self.image, self.mirror, - self.mean, self.std) + self.result = np_mirror_normalize(self.image, self.mirror, self.mean, + self.std) def test_check_output_dynamic(self): # NOTE: only supoort CUDA kernel currently if not core.is_compiled_with_cuda(): return - dy_result = mirror_normalize(paddle.to_tensor(self.image), - paddle.to_tensor(self.mirror), - self.mean, self.std) + dy_result = mirror_normalize( + paddle.to_tensor(self.image), + paddle.to_tensor(self.mirror), self.mean, self.std) assert np.allclose(self.result, dy_result.numpy()) def test_check_output_static(self): @@ -74,14 +76,12 @@ def test_check_output_static(self): paddle.enable_static() - image_data = paddle.static.data(shape=self.image_shape, - dtype='float32', - name="image") - mirror_data = paddle.static.data(shape=self.mirror_shape, - dtype='bool', - name="mirror") - result_data = mirror_normalize(image_data, mirror_data, - self.mean, self.std) + image_data = paddle.static.data( + shape=self.image_shape, 
dtype='float32', name="image") + mirror_data = paddle.static.data( + shape=self.mirror_shape, dtype='bool', name="mirror") + result_data = mirror_normalize(image_data, mirror_data, self.mean, + self.std) # NOTE: only supoort CUDA kernel currently places = [] @@ -90,10 +90,11 @@ def test_check_output_static(self): for place in places: exe = paddle.static.Executor(place) - st_result = exe.run(paddle.static.default_main_program(), - feed={"image": self.image, - "mirror": self.mirror}, - fetch_list=[result_data]) + st_result = exe.run( + paddle.static.default_main_program(), + feed={"image": self.image, + "mirror": self.mirror}, + fetch_list=[result_data]) assert np.allclose(self.result, st_result) @@ -107,11 +108,13 @@ def setUp(self): self.mean = [123.675] self.std = [58.395] - self.image = np.random.randint(0, 256, self.image_shape, 'int32').astype("float32") - self.mirror = np.random.randint(0, 2, self.mirror_shape, 'int32').astype("bool") + self.image = np.random.randint(0, 256, self.image_shape, + 'int32').astype("float32") + self.mirror = np.random.randint(0, 2, self.mirror_shape, + 'int32').astype("bool") - self.result = np_mirror_normalize(self.image, self.mirror, - self.mean, self.std) + self.result = np_mirror_normalize(self.image, self.mirror, self.mean, + self.std) class TestMirrorNormalizeFloatMeanStd(TestMirrorNormalize): @@ -121,11 +124,13 @@ def setUp(self): self.mean = 123.675 self.std = 58.395 - self.image = np.random.randint(0, 256, self.image_shape, 'int32').astype("float32") - self.mirror = np.random.randint(0, 2, self.mirror_shape, 'int32').astype("bool") + self.image = np.random.randint(0, 256, self.image_shape, + 'int32').astype("float32") + self.mirror = np.random.randint(0, 2, self.mirror_shape, + 'int32').astype("bool") - self.result = np_mirror_normalize(self.image, self.mirror, - self.mean, self.std) + self.result = np_mirror_normalize(self.image, self.mirror, self.mean, + self.std) if __name__ == '__main__': From 
210f5b5ae10107a3357f18a6a170f1f085e6cf7b Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 5 Apr 2022 09:10:11 +0000 Subject: [PATCH 90/95] fix unittest. test=develop --- python/paddle/tests/CMakeLists.txt | 1 + python/paddle/tests/test_ops_crop_resize.py | 31 ---------------- .../tests/test_ops_file_label_loader.py | 37 ++++++------------- 3 files changed, 13 insertions(+), 56 deletions(-) diff --git a/python/paddle/tests/CMakeLists.txt b/python/paddle/tests/CMakeLists.txt index 1ba5e2175a3d9d..3cfb36e08bc669 100644 --- a/python/paddle/tests/CMakeLists.txt +++ b/python/paddle/tests/CMakeLists.txt @@ -10,6 +10,7 @@ endforeach() if (WIN32) LIST(REMOVE_ITEM TEST_OPS test_data_pipeline) + LIST(REMOVE_ITEM TEST_OPS test_ops_file_label_loader) LIST(REMOVE_ITEM TEST_OPS test_ops_decode) LIST(REMOVE_ITEM TEST_OPS test_ops_crop_resize) LIST(REMOVE_ITEM TEST_OPS test_ops_file_label_loader) diff --git a/python/paddle/tests/test_ops_crop_resize.py b/python/paddle/tests/test_ops_crop_resize.py index bb03f7f68f7c48..87df7a2eb37689 100644 --- a/python/paddle/tests/test_ops_crop_resize.py +++ b/python/paddle/tests/test_ops_crop_resize.py @@ -492,22 +492,6 @@ def setUp(self): self.out_shape = (2, 3, 20, 30) - self._is_np_built = False - self.build_np_data() - - -class TestImageCropResizeNearestNHWC(TestImageCropResizeNearestNCHW): - def setUp(self): - self.image_shape1 = [16, 16, 3] - self.image_shape2 = [32, 32, 3] - self.size = (20, 30) - self.interp_method = "bilinear" - self.data_format = "NHWC" - self.align_corners = False - self.align_mode = 1 - - self.out_shape = (2, 20, 30, 3) - self.build_np_data() @@ -557,20 +541,5 @@ def setUp(self): self.build_np_data() -class TestImageCropResizeNearestNHWCAlignCorner(TestImageCropResizeNearestNCHW): - def setUp(self): - self.image_shape1 = [16, 16, 3] - self.image_shape2 = [32, 32, 3] - self.size = (20, 30) - self.interp_method = "bilinear" - self.data_format = "NHWC" - self.align_corners = True - self.align_mode = 1 - - 
self.out_shape = (2, 20, 30, 3) - - self.build_np_data() - - if __name__ == '__main__': unittest.main() diff --git a/python/paddle/tests/test_ops_file_label_loader.py b/python/paddle/tests/test_ops_file_label_loader.py index 17dbfa6030d7a0..e8137d190c9c08 100644 --- a/python/paddle/tests/test_ops_file_label_loader.py +++ b/python/paddle/tests/test_ops_file_label_loader.py @@ -32,7 +32,7 @@ class TestFileLabelLoaderStatic(unittest.TestCase): - def setup(self): + def setUp(self): self.data_root = get_path_from_url(DATASET_URL, DATASET_HOME, DATASET_MD5) self.batch_size = 16 @@ -40,9 +40,6 @@ def setup(self): self.drop_last = False self.dynamic = False - if not self.dynamic: - self.build_program() - def build_program(self): paddle.enable_static() self.indices_data = paddle.static.data( @@ -63,7 +60,12 @@ def loader_function(self, indices): fetch_list=[self.sample_data, self.label_data]) def test_check_output(self): - self.setup() + # NOTE: only support cuda kernel currently + if not core.is_compiled_with_cuda(): + return + + if not self.dynamic: + self.build_program() data_folder = DatasetFolder(self.data_root) samples = [s[0] for s in data_folder.samples] @@ -86,7 +88,7 @@ def test_check_output(self): class TestFileLabelLoaderDynamic(TestFileLabelLoaderStatic): - def setup(self): + def setUp(self): self.data_root = get_path_from_url(DATASET_URL, DATASET_HOME, DATASET_MD5) self.batch_size = 16 @@ -94,12 +96,9 @@ def setup(self): self.drop_last = False self.dynamic = True - if not self.dynamic: - self.build_program() - class TestFileLabelLoaderStaticShuffle(TestFileLabelLoaderStatic): - def setup(self): + def setUp(self): self.data_root = get_path_from_url(DATASET_URL, DATASET_HOME, DATASET_MD5) self.batch_size = 16 @@ -107,12 +106,9 @@ def setup(self): self.drop_last = False self.dynamic = False - if not self.dynamic: - self.build_program() - class TestFileLabelLoaderDynamicShuffle(TestFileLabelLoaderStatic): - def setup(self): + def setUp(self): self.data_root = 
get_path_from_url(DATASET_URL, DATASET_HOME, DATASET_MD5) self.batch_size = 16 @@ -120,12 +116,9 @@ def setup(self): self.drop_last = False self.dynamic = True - if not self.dynamic: - self.build_program() - class TestFileLabelLoaderStaticDropLast(TestFileLabelLoaderStatic): - def setup(self): + def setUp(self): self.data_root = get_path_from_url(DATASET_URL, DATASET_HOME, DATASET_MD5) self.batch_size = 16 @@ -133,12 +126,9 @@ def setup(self): self.drop_last = True self.dynamic = False - if not self.dynamic: - self.build_program() - class TestFileLabelLoaderDynamicDropLast(TestFileLabelLoaderStatic): - def setup(self): + def setUp(self): self.data_root = get_path_from_url(DATASET_URL, DATASET_HOME, DATASET_MD5) self.batch_size = 16 @@ -146,9 +136,6 @@ def setup(self): self.drop_last = True self.dynamic = True - if not self.dynamic: - self.build_program() - if __name__ == '__main__': unittest.main() From a559fb2db2c606c9b6d32db3044f5b1aa1d8d4fe Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 5 Apr 2022 14:33:35 +0000 Subject: [PATCH 91/95] complete docs. 
test=develop --- .../data/batch_random_crop_and_resize_op.cc | 4 +- .../data/batch_random_crop_and_resize_op.cu | 3 +- .../fluid/operators/data/batch_resize_op.cc | 5 +- .../fluid/operators/data/batch_resize_op.cu | 6 +- paddle/fluid/operators/data/dataloader_op.cc | 4 - paddle/fluid/operators/data/utils.h | 3 - paddle/fluid/pybind/op_function_generator.h | 1 + python/paddle/fluid/dataloader/ops.py | 116 +++++++++- python/paddle/fluid/reader.py | 14 +- python/paddle/vision/ops.py | 208 ++++++++++++------ python/paddle/vision/reader.py | 82 +++++-- 11 files changed, 348 insertions(+), 98 deletions(-) diff --git a/paddle/fluid/operators/data/batch_random_crop_and_resize_op.cc b/paddle/fluid/operators/data/batch_random_crop_and_resize_op.cc index c47c0a246d1b68..2f86998c68680d 100644 --- a/paddle/fluid/operators/data/batch_random_crop_and_resize_op.cc +++ b/paddle/fluid/operators/data/batch_random_crop_and_resize_op.cc @@ -122,4 +122,6 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker); REGISTER_OP_CPU_KERNEL(batch_random_crop_and_resize, - ops::data::BatchRandomCropAndResizeCPUKernel) + ops::data::BatchRandomCropAndResizeCPUKernel, + ops::data::BatchRandomCropAndResizeCPUKernel, + ops::data::BatchRandomCropAndResizeCPUKernel); diff --git a/paddle/fluid/operators/data/batch_random_crop_and_resize_op.cu b/paddle/fluid/operators/data/batch_random_crop_and_resize_op.cu index 2b04e908fe4512..bed4409aa777ab 100644 --- a/paddle/fluid/operators/data/batch_random_crop_and_resize_op.cu +++ b/paddle/fluid/operators/data/batch_random_crop_and_resize_op.cu @@ -346,4 +346,5 @@ class BatchRandomCropAndResizeCUDAKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL(batch_random_crop_and_resize, ops::data::BatchRandomCropAndResizeCUDAKernel, - ops::data::BatchRandomCropAndResizeCUDAKernel); + ops::data::BatchRandomCropAndResizeCUDAKernel, + ops::data::BatchRandomCropAndResizeCUDAKernel); diff --git 
a/paddle/fluid/operators/data/batch_resize_op.cc b/paddle/fluid/operators/data/batch_resize_op.cc index e46f12cb6b23ed..590c61c522bfd1 100644 --- a/paddle/fluid/operators/data/batch_resize_op.cc +++ b/paddle/fluid/operators/data/batch_resize_op.cc @@ -107,4 +107,7 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL(batch_resize, ops::data::BatchResizeCPUKernel) +REGISTER_OP_CPU_KERNEL(batch_resize, + ops::data::BatchResizeCPUKernel, + ops::data::BatchResizeCPUKernel, + ops::data::BatchResizeCPUKernel) diff --git a/paddle/fluid/operators/data/batch_resize_op.cu b/paddle/fluid/operators/data/batch_resize_op.cu index e1164043b8c117..dc099daaa2ccc3 100644 --- a/paddle/fluid/operators/data/batch_resize_op.cu +++ b/paddle/fluid/operators/data/batch_resize_op.cu @@ -262,5 +262,7 @@ class BatchResizeCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(batch_resize, ops::data::BatchResizeCUDAKernel, - ops::data::BatchResizeCUDAKernel); +REGISTER_OP_CUDA_KERNEL(batch_resize, + ops::data::BatchResizeCUDAKernel, + ops::data::BatchResizeCUDAKernel, + ops::data::BatchResizeCUDAKernel); diff --git a/paddle/fluid/operators/data/dataloader_op.cc b/paddle/fluid/operators/data/dataloader_op.cc index 12b7217bf985ea..accf9598c519e4 100644 --- a/paddle/fluid/operators/data/dataloader_op.cc +++ b/paddle/fluid/operators/data/dataloader_op.cc @@ -62,10 +62,6 @@ class DataLoaderOpMaker : public framework::OpProtoAndCheckerMaker { "(int64_t)" "The unique hash id used as cache key for " "ExecutorInfoCache"); - // AddAttr("prefetch_depth", - // "(int64_t)" - // "The prefetch batch number") - // .SetDefault(2); AddComment(R"DOC( DataLoader Op )DOC"); diff --git a/paddle/fluid/operators/data/utils.h b/paddle/fluid/operators/data/utils.h index 5d609db8bbf9e0..7b63cb0d3d2867 100644 --- a/paddle/fluid/operators/data/utils.h +++ 
b/paddle/fluid/operators/data/utils.h @@ -40,9 +40,6 @@ void ShutDownAllDataLoaders() { // step 3: shutdown MapRunner MapRunnerManager::Instance()->ShutDown(); - - // step 3: shutdown Pipeline - PipelineManager::Instance()->ShutDown(); } void ShutDownReadersAndDecoders(const int64_t program_id) { diff --git a/paddle/fluid/pybind/op_function_generator.h b/paddle/fluid/pybind/op_function_generator.h index 74434c8cc2e41b..a24e2e1bb4b867 100644 --- a/paddle/fluid/pybind/op_function_generator.h +++ b/paddle/fluid/pybind/op_function_generator.h @@ -242,6 +242,7 @@ std::map> op_passing_outs_map = { {"run_program", {"Out", "DOut", "OutScope"}}, {"dataloader", {"Out"}}, {"map", {"Out"}}, + {"file_label_loader", {"Image"}}, {"clear_float_status", {"FloatStatusOut"}}, {"get_float_status", {"FloatStatusOut"}}, {"assign", {"Out"}}, diff --git a/python/paddle/fluid/dataloader/ops.py b/python/paddle/fluid/dataloader/ops.py index 020aae1596b923..6a66d6af1a87e3 100755 --- a/python/paddle/fluid/dataloader/ops.py +++ b/python/paddle/fluid/dataloader/ops.py @@ -18,7 +18,7 @@ from ...fluid import core, framework, Program, program_guard, unique_name from ...fluid.layers.utils import _hash_with_id -from ...fluid.framework import in_dygraph_mode +from ...fluid.framework import _non_static_mode from ...common_ops_import import * from collections.abc import Sequence, Mapping @@ -63,7 +63,59 @@ def _generate_stream_id(): def map(map_func, *args, **kwargs): - if in_dygraph_mode(): + """ + This API used to split data loading stages of :attr:`DataPipeline`, the + map function will be run in independent C++ thread and stream. + + Args: + map_func (callable): A callable function construct of data + preprocess OPs. + + Returns: + The output of map function + + Examples: + .. 
code-block:: python + + import os + import paddle + from paddle.utils.download import get_path_from_url + + DATASET_HOME = os.path.expanduser("~/.cache/paddle/datasets") + DATASET_URL = "https://paddlemodels.cdn.bcebos.com/ImageNet_stub.tar" + DATASET_MD5 = "c7110519124a433901cf005a4a91b607" + BATCH_SIZE = 100 + + data_root = get_path_from_url(DATASET_URL, DATASET_HOME, + DATASET_MD5) + + def imagenet_pipeline(): + image, label = paddle.vision.reader.file_label_reader( + data_root, batch_size=BATCH_SIZE) + + def decode(image): + image = paddle.vision.ops.image_decode_random_crop(image, num_threads=4) + return image + def resize(image): + image = paddle.vision.ops.image_resize(image, size=224) + return image + def flip_normalize(image): + mirror = paddle.vision.ops.random_flip(image, prob=0.5) + image = paddle.vision.ops.mirror_normalize(image, mirror) + return image + + image = paddle.io.map(decode, image) + image = paddle.io.map(resize, image) + image = paddle.io.map(flip_normalize, image) + + return {'image': image, 'label': label} + + dataloader = paddle.io.DataLoader(imagenet_pipeline) + for data in dataloader: + print(data['image'].shape, data['label'].shape) + + """ + if _non_static_mode(): return map_func(inputs) helper = LayerHelper("map", **locals()) @@ -169,7 +221,65 @@ def data_reader(reader_func, shuffle=False, drop_last=False, seed=None): - assert not in_dygraph_mode(), \ + """ + This API used to auto loading dataset in :attr:`DataPipeline`, the + reader function will be run in independent C++ thread. + + Args: + reader_func (callable): A callable function construct of a data + loader OP. + batch_size (int): The batch size of a mini-batch. Default 1. + shuffle (bool): Whether to shuffle samples. Default False. + drop_last (bool): Whether to drop the last incomplete batch. Default False. + seed (int, optional): The seed for sample shuffling. Default None. + + Returns: + The output of reader function + + Examples: + .. 
code-block:: python + + import os + import paddle + from paddle.utils.download import get_path_from_url + + DATASET_HOME = os.path.expanduser("~/.cache/paddle/datasets") + DATASET_URL = "https://paddlemodels.cdn.bcebos.com/ImageNet_stub.tar" + DATASET_MD5 = "c7110519124a433901cf005a4a91b607" + BATCH_SIZE = 100 + NUM_SAMPLES = 100 + + data_root = get_path_from_url(DATASET_URL, DATASET_HOME, + DATASET_MD5) + + def imagenet_pipeline(): + def imagenet_reader(indices): + return paddle.vision.reader.file_label_loader( + data_root, indices, BATCH_SIZE) + + outs = paddle.io.data_reader(imagenet_reader, + BATCH_SIZE, NUM_SAMPLES) + image = outs[:-1] + label = outs[-1] + + def decode(image): + image = paddle.vision.ops.image_decode_random_crop(image, num_threads=4) + return image + def resize(image): + image = paddle.vision.ops.image_resize(image, size=224) + return image + + image = paddle.io.map(decode, image) + image = paddle.io.map(resize, image) + + return {'image': image, 'label': label} + + dataloader = paddle.io.DataLoader(imagenet_pipeline) + for data in dataloader: + print(data['image'].shape, data['label'].shape) + + """ + assert not _non_static_mode(), \ "paddle.io.data_reader can only be used in static mode" helper = LayerHelper("data_reader", **locals()) diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py index ca4f1ec65bd0e3..907fe62a1c9f15 100644 --- a/python/paddle/fluid/reader.py +++ b/python/paddle/fluid/reader.py @@ -184,9 +184,17 @@ class DataLoader(object): Args: - dataset(Dataset|callable): the dataset to load data from, should be an - instance of subclass of :code:`paddle.io.Dataset` or - :code:`paddle.io.IterableDataset`. + dataset(Dataset|callable): the dataset to load data from, there + are 2 available types: + 1. an instance of subclass of :code:`paddle.io.Dataset` or + :code:`paddle.io.IterableDataset` for Python multi-process + DataLoader. + 2. 
a callable function constructed with + :code:`paddle.io.data_reader`, :code:`paddle.io.map` and other + data processing OPs from :code:`paddle.vision.ops` for C++ + multi-thread and multi-stream DataLoader. Only support data + preprocessing of ImageNet dataset currently. Please see + :code:`paddle.io.map` for example codes. feed_list (list(Tensor)|tuple(Tensor)): feed Tensor list. The Tensors should be created by :code:`paddle.static.data()`. :attr:`feed_list` must be set if :attr:`return_list` is diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index 1f4ac2febe1e50..2bc032b4af1f4d 100644 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -846,18 +846,20 @@ def read_file(filename, name=None): Examples: .. code-block:: python - - import cv2 import paddle + from paddle.utils.download import get_path_from_url - fake_img = (np.random.random( - (400, 300, 3)) * 255).astype('uint8') + DATASET_HOME = os.path.expanduser("~/.cache/paddle/datasets") + DATASET_URL = "https://paddlemodels.cdn.bcebos.com/ImageNet_stub.tar" + DATASET_MD5 = "c7110519124a433901cf005a4a91b607" + BATCH_SIZE = 16 - cv2.imwrite('fake.jpg', fake_img) - - img_bytes = paddle.vision.ops.read_file('fake.jpg') - - print(img_bytes.shape) + data_root = get_path_from_url(DATASET_URL, DATASET_HOME, + DATASET_MD5) + indices = paddle.arange(BATCH_SIZE) + outs = paddle.vision.reader.file_label_loader(data_root, + indices, BATCH_SIZE) + print(outs[0].shape) """ @@ -881,36 +883,46 @@ def image_decode(x, device_memory_padding=0, name=None): """ - Decodes a JPEG image into a 3 dimensional RGB Tensor or 1 dimensional Gray Tensor. - Optionally converts the image to the desired format. - The values of the output tensor are uint8 between 0 and 255. + Decodes a batch of JPEG images into a list of 3 dimensional RGB + Tensors with multi-threads and Nvjpeg. 
Default Nvjpeg decoding + output format is RGBI, for detailed information, + please see https://docs.nvidia.com/cuda/nvjpeg/index.html. + + This api is only available for Paddle GPU version + + The values of the output tensors are uint8 between 0 and 255. Args: - x (Tensor): A one dimensional uint8 tensor containing the raw bytes - of the JPEG image. - mode (str): The read mode used for optionally converting the image. - Default: 'unchanged'. - num_threads (int): parallel thread number. + x (List[Tensor]): A list of one dimensional uint8 Tensors + containing the raw bytes of the JPEG image. + num_threads (int): The parallel thread number for decoding + host_memory_padding (int): The CUDA pinned memory allocation + padding size of Nvjpeg decoding. Default 0. + device_memory_padding (int): The CUDA memory allocation padding + size of Nvjpeg decoding. Default 0. name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. + Returns: - Tensor: A decoded image tensor with shape (imge_channels, image_height, image_width) + Tensor: A list of decoded image tensors with shape of + (imge_channels, image_height, image_width) Examples: .. code-block:: python import cv2 import paddle - + import numpy as np + fake_img = (np.random.random( - (400, 300, 3)) * 255).astype('uint8') - + (400, 300, 3)) * 255).astype('uint8') + cv2.imwrite('fake.jpg', fake_img) - + img_bytes = paddle.vision.ops.read_file('fake.jpg') - img = paddle.vision.ops.decode_jpeg(img_bytes) - - print(img.shape) + imgs = paddle.vision.ops.image_decode([img_bytes]) + + print(imgs[0].shape) """ local_rank = paddle.distributed.get_rank() @@ -959,40 +971,64 @@ def image_decode_random_crop(x, num_attempts=10, name=None): """ - Decodes a JPEG image into a 3 dimensional RGB Tensor or 1 dimensional Gray Tensor. - Optionally converts the image to the desired format. 
- The values of the output tensor are uint8 between 0 and 255. + Decodes and performs random cropping on a batch of JPEG images into + a list of 3 dimensional RGB Tensors with multi-threads and Nvjpeg. + Default Nvjpeg decoding output format is RGBI, for detailed information, + please see https://docs.nvidia.com/cuda/nvjpeg/index.html. + + This api is only available for Paddle GPU version + + The values of the output tensors are uint8 between 0 and 255. Args: - x (Tensor): A one dimensional uint8 tensor containing the raw bytes - of the JPEG image. - num_threads (int): parallel thread number. - aspect_ratio_min (float): - aspect_ratio_max (float): - area_min (float): - area_max (float): - num_attempts (int): + x (List[Tensor]): A list of one dimensional uint8 Tensors + containing the raw bytes of the JPEG image. + num_threads (int): The parallel thread number for decoding + host_memory_padding (int): The CUDA pinned memory allocation + padding size of Nvjpeg decoding. Default 0. + device_memory_padding (int): The CUDA memory allocation padding + size of Nvjpeg decoding. Default 0. + data_format (string): The output image format, if NCHW, output + images will be in shape of (channels, image_height, + image_width), if NHWC, output images will be in shape of + (image_height, image_width, channels). Default NCHW. + aspect_ratio_min (float): The minimum aspect ratio of random + cropping boxes, this should be a value between 0 and + 1. Default :attr:`3. / 4.`. + aspect_ratio_max (float): The maximum aspect ratio of random + cropping boxes, this should be a value greater than 1. + Default :attr:`4. / 3.`. + area_min (float): The minimum area ratio of random cropping boxes, + this should be a value between 0 and 1. Default 0.08. + area_max (float): The maximum area ratio of random cropping boxes, + this should be a value between 0 and 1. Default 1. + num_attempts (int): The max attempt number to find random cropping + boxes, this should be a positive integer. Default 10. 
name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. + Returns: - Tensor: A decoded image tensor with shape (imge_channels, image_height, image_width) + Tensor: A list of decoded image tensors with shape of + (imge_channels, image_height, image_width) Examples: .. code-block:: python import cv2 import paddle - + import numpy as np + fake_img = (np.random.random( - (400, 300, 3)) * 255).astype('uint8') - + (400, 300, 3)) * 255).astype('uint8') + cv2.imwrite('fake.jpg', fake_img) - + img_bytes = paddle.vision.ops.read_file('fake.jpg') - img = paddle.vision.ops.decode_jpeg(img_bytes) - - print(img.shape) + imgs = paddle.vision.ops.image_decode_random_crop([img_bytes]) + + print(imgs[0].shape) """ + local_rank = paddle.distributed.get_rank() if in_dygraph_mode(): out = core.VarBase(core.VarDesc.VarType.UINT8, [], @@ -1039,6 +1075,33 @@ def image_decode_random_crop(x, def random_flip(x, prob=0.5, name=None): + """ + This API generates flipping mirror flags for input Tensor, it treats + the 1st dimension as batch size and generates a bool value of whether + to flip the input samples for each sample. + + Args: + x (Tensor): The input tensor in shape of [N, ...], N if the batch + size to generate random flipping mirror flags. + prob (float): The probability for flip the input samples, this + should be a float value between 0 and 1. Default 0.5 + name (str, optional): The default value is None. Normally there is no + need for user to set this property. For more information, please + refer to :ref:`api_guide_Name`. + + Returns: + Tensor: A bool Tensor in shape of [N, 1], N is the shape of 1st + dimension of input Tensor. + + Examples: + .. code-block:: python + import paddle + + x = paddle.rand(shape=[8, 3, 32, 32]) + mirror = paddle.vision.ops.random_flip(x) + + print(mirror) + """ if prob < 0. 
or prob > 1.: raise ValueError("prob should in (0, 1) in random_flip") @@ -1545,12 +1608,24 @@ def random_crop_and_resize(x, Args: x (List[Tensor]): A list of input images, 3D-Tensor with the shape of [C,H,W] or [H,W,c]. The data type is uint8 or float32. - size (int|list|tuple): Target size of output image, - with (height, width) shape. - scale (list|tuple): Scale range of the cropped image before resizing, - relatively to the origin image. Default: (0.08, 1.0) - ratio (list|tuple): Range of aspect ratio of the origin aspect ratio - cropped. Default: (0.75, 1.33) + size (int|list|tuple): Target size of output image, with (height, + width) shape. + aspect_ratio_min (float): The minimum aspect ratio of random + cropping boxes, this should be a value between 0 and + 1. Default :attr:`3. / 4.`. + aspect_ratio_max (float): The maximum aspect ratio of random + cropping boxes, this should be a value greater than 1. + Default :attr:`4. / 3.`. + area_min (float): The minimum area ratio of random cropping boxes, + this should be a value between 0 and 1. Default 0.08. + area_max (float): The maximum area ratio of random cropping boxes, + this should be a value between 0 and 1. Default 1. + num_attempts (int): The max attempt number to find random cropping + boxes, this should be a position integer. Default 10. + data_format (string): The input image format, if NCHW, input + images will be in shape of (channels, image_height, + image_width), if NHWC, input images will be in shape of + (image_height, image_width, channels). Default NCHW. interp_method (str, optional): Interpolation method. Default: 'bilinear'. support method are as following: - "nearest", @@ -1570,18 +1645,22 @@ def random_crop_and_resize(x, default. Returns: - Tensor: The output of RandomCropAndResizeOp is a 4-D tensor with shape - (batch_size, channels, h, w). The data type is uint8 or float32. 
+ Tensor: The output is a 4-D tensor with shape (batch_size, + channels, h, w) or (batch_random_crop_and_resize, h, w, + channels). The data type is uint8 or float32. Examples: .. code-block:: python import paddle - from paddle.vision.ops import batch_random_crop_and_resize - data = paddle.rand([3, 256, 256]) - out = batch_random_crop_and_resize([data]) + data = paddle.randn(shape=[3, 256, 256]) + data = paddle.cast(data, dtype='uint8') + out = paddle.vision.ops.random_crop_and_resize([data], size=224) + + print(out.shape) """ + check_type(size, 'size', (int, tuple), 'batch_random_crop_and_resize') assert interp_method in ['bilinear', 'nearest'] assert data_format in ['NCHW', 'NHWC'] @@ -1638,9 +1717,9 @@ def image_resize(x, Args: x (List[Tensor]): A list of input images, 3D-Tensor with the shape - of [C,H,W] or [H,W,c]. The data type is uint8 or float32. - size (int|list|tuple): Target size of output image, - with (height, width) shape. + of [C, H, W] or [H, W, C]. The data type is uint8 or float32. + size (int|list|tuple): Target size of output image, with (height, + width) shape. interp_method (str, optional): Interpolation method. Default: 'bilinear'. support method are as following: - "nearest", @@ -1660,17 +1739,20 @@ def image_resize(x, default. Returns: - Tensor: The output of RandomCropAndResizeOp is a 4-D tensor with shape - (batch_size, channels, h, w). The data type is uint8 or float32. + Tensor: The output of image_resizeis a 4-D tensor with shape + (batch_size, channels, h, w) or (batch_resize, h, w, channels). + The data type is uint8 or float32. Examples: .. 
code-block:: python import paddle - from paddle.vision.ops import image_resize - data = paddle.rand([3, 256, 256]) - out = image_resize([data]) + data = paddle.randn(shape=[3, 256, 256]) + data = paddle.cast(data, dtype='uint8') + out = paddle.vision.ops.image_resize([data], size=224) + + print(out.shape) """ check_type(size, 'size', (int, tuple), 'image_resize') assert interp_method in ['bilinear', 'nearest'] diff --git a/python/paddle/vision/reader.py b/python/paddle/vision/reader.py index 7c8c22e2f676b0..400ac1bf3214ec 100644 --- a/python/paddle/vision/reader.py +++ b/python/paddle/vision/reader.py @@ -16,7 +16,7 @@ from ..fluid.layer_helper import LayerHelper, unique_name from ..fluid import core, layers from ..fluid.layers import nn, utils -from ..fluid.framework import in_dygraph_mode +from ..fluid.framework import _non_static_mode import paddle from paddle.common_ops_import import * @@ -87,15 +87,43 @@ def file_label_loader(data_root, indices, batch_size, name=None): Reads a batch of data, outputs the bytes contents of a file as a uint8 Tensor with one dimension. + This API can only be used in Paddle GPU version. + Args: - data_root (str): root directory of data - indices (list of int): batch indices of samples + data_root (str): root directory of ImageNet dataset. + indices (Tensor): A Tensor of batch indices of samples in shape of + [N], while N is the batch size. + batch_size (int): The batch size, same as shape of indices. name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A list of image Tensor holds byte streams of a batch of images and + A Tensor of label Tensor. + + Examples: + .. 
code-block:: python + + import os + import paddle + from paddle.utils.download import get_path_from_url + + DATASET_HOME = os.path.expanduser("~/.cache/paddle/datasets") + DATASET_URL = "https://paddlemodels.cdn.bcebos.com/ImageNet_stub.tar" + DATASET_MD5 = "c7110519124a433901cf005a4a91b607" + BATCH_SIZE = 16 + + data_root = get_path_from_url(DATASET_URL, DATASET_HOME, + DATASET_MD5) + indices = paddle.arange(BATCH_SIZE) + images, labels = paddle.vision.reader.file_label_loader( + data_root, indices, BATCH_SIZE) + print(images[0].shape, labels.shape) + """ - if in_dygraph_mode(): + if _non_static_mode(): image = [ core.VarBase(core.VarDesc.VarType.UINT8, [], unique_name.generate("file_label_loader"), @@ -130,40 +158,61 @@ def file_label_loader(data_root, indices, batch_size, name=None): return image, label -def file_label_reader(file_root, +def file_label_reader(data_root, batch_size=1, shuffle=False, drop_last=False, seed=None): """ - Reads and outputs the bytes contents of a file as a uint8 Tensor - with one dimension. + Reads batches of data iterably, outputs the bytes contents of a file + as a uint8 Tensor with one dimension. + + This API will start a C++ thread to load data with + :attr:`file_label_loader`, and yiled data iterably. + + This API can only be used in Paddle GPU version. Args: - filename (str): Path of the file to be read. + data_root (str): root directory of ImageNet dataset. + batch_size (int): The batch size of a mini-batch. Default 1. + shuffle (bool): Whether to shuffle samples. Default False. + drop_last (bool): Whether to drop the last incomplete batch. Default False. + seed (int, optional): The seed for sample shuffling. Default None. name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. Returns: - A uint8 tensor. + A list of image Tensor holds byte streams of a batch of images and + A Tensor of label Tensor. Examples: .. 
code-block:: python - import cv2 + import os import paddle + from paddle.utils.download import get_path_from_url - image = paddle.vision.ops.file_label_reader('/workspace/datasets/ILSVRC2012/val/', 2) + DATASET_HOME = os.path.expanduser("~/.cache/paddle/datasets") + DATASET_URL = "https://paddlemodels.cdn.bcebos.com/ImageNet_stub.tar" + DATASET_MD5 = "c7110519124a433901cf005a4a91b607" + BATCH_SIZE = 16 + + data_root = get_path_from_url(DATASET_URL, DATASET_HOME, + DATASET_MD5) + images, labels = paddle.vision.reader.file_label_reader( + data_root, BATCH_SIZE) + print(images[0].shape, labels.shape) """ + from paddle.vision.datasets import DatasetFolder - data_folder = DatasetFolder(file_root) + data_folder = DatasetFolder(data_root) samples = [s[0] for s in data_folder.samples] targets = [s[1] for s in data_folder.samples] - if in_dygraph_mode(): - sample_id = utils._hash_with_id(file_root, batch_size, shuffle, + if _non_static_mode(): + sample_id = utils._hash_with_id(data_root, batch_size, shuffle, drop_last) sampler = _sampler_manager.get(sample_id, batch_size=batch_size, @@ -171,11 +220,10 @@ def file_label_reader(file_root, shuffle=shuffle, drop_last=drop_last) indices = paddle.to_tensor(next(sampler), dtype='int64') - outs = file_label_loader(file_root, indices, batch_size) - return outs[:-1], outs[-1] + return file_label_loader(data_root, indices, batch_size) def _reader(indices): - return file_label_loader(file_root, indices, batch_size) + return file_label_loader(data_root, indices, batch_size) outs = paddle.io.data_reader( _reader, From a01f87035a65dd784f63f3e6fe557686360a3f0b Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 5 Apr 2022 14:58:13 +0000 Subject: [PATCH 92/95] add C++ docs. 
test=develop --- .../fluid/operators/data/batch_decode_op.cc | 6 ++--- .../data/batch_decode_random_crop_op.cc | 27 ++++++++++++------- .../data/batch_random_crop_and_resize_op.cc | 20 ++++++++++---- .../fluid/operators/data/batch_resize_op.cc | 2 +- paddle/fluid/operators/data/dataloader_op.cc | 8 +++++- .../operators/data/file_label_loader_op.cc | 7 ++++- paddle/fluid/operators/data/map_op.cc | 3 ++- .../operators/data/mirror_normalize_op.cc | 3 ++- 8 files changed, 53 insertions(+), 23 deletions(-) diff --git a/paddle/fluid/operators/data/batch_decode_op.cc b/paddle/fluid/operators/data/batch_decode_op.cc index 04a366dc6f3c23..bb3367d4c47909 100644 --- a/paddle/fluid/operators/data/batch_decode_op.cc +++ b/paddle/fluid/operators/data/batch_decode_op.cc @@ -60,10 +60,8 @@ class BatchDecodeOpMaker : public framework::OpProtoAndCheckerMaker { .AsDuplicable(); AddOutput("Out", "The output tensor of BatchDecodeOp").AsDuplicable(); AddComment(R"DOC( -This operator decodes a JPEG image into a 3 dimensional RGB Tensor -or 1 dimensional Gray Tensor. Optionally converts the image to the -desired format. The values of the output tensor are uint8 between 0 -and 255. +This operator decodes a JPEG image into a 3 dimensional RGB Tensor. +The values of the output tensor are uint8 between 0 and 255. 
)DOC"); AddAttr("num_threads", "Path of the file to be readed.").SetDefault(2); AddAttr("local_rank", diff --git a/paddle/fluid/operators/data/batch_decode_random_crop_op.cc b/paddle/fluid/operators/data/batch_decode_random_crop_op.cc index 020c400eb5992a..691d738a87a110 100644 --- a/paddle/fluid/operators/data/batch_decode_random_crop_op.cc +++ b/paddle/fluid/operators/data/batch_decode_random_crop_op.cc @@ -105,10 +105,9 @@ class BatchDecodeRandomCropOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "The output tensor of BatchDecodeRandomCropOp") .AsDuplicable(); AddComment(R"DOC( -This operator decodes a JPEG image into a 3 dimensional RGB Tensor -or 1 dimensional Gray Tensor. Optionally converts the image to the -desired format. The values of the output tensor are uint8 between 0 -and 255. +This operator decodes a JPEG image into a 3 dimensional RGB Tensor. +Optionally converts the image to the desired format. +The values of the output tensor are uint8 between 0 and 255. )DOC"); AddAttr("local_rank", "(int64_t)" @@ -129,11 +128,21 @@ and 255. "Specify that the data format of the input and output data is " "channel_first or channel_last.") .SetDefault("NCHW"); - AddAttr("aspect_ratio_min", "").SetDefault(3. / 4.); - AddAttr("aspect_ratio_max", "").SetDefault(4. / 3.); - AddAttr("area_min", "").SetDefault(0.08); - AddAttr("area_max", "").SetDefault(1.); - AddAttr("num_attempts", "").SetDefault(10); + AddAttr("aspect_ratio_min", + "(float) The minimum aspect ratio of random cropping boxes") + .SetDefault(3. / 4.); + AddAttr("aspect_ratio_max", + "(float) The maximum aspect ratio of random cropping boxes") + .SetDefault(4. 
/ 3.); + AddAttr("area_min", + "(float) The min area ratio of random cropping boxes") + .SetDefault(0.08); + AddAttr("area_max", + "(float) The max area ratio of random cropping boxes") + .SetDefault(1.); + AddAttr("num_attempts", + "(int) The max attempt number of random cropping boxes") + .SetDefault(10); AddAttr("program_id", "(int64_t)" "The unique hash id used as cache key for " diff --git a/paddle/fluid/operators/data/batch_random_crop_and_resize_op.cc b/paddle/fluid/operators/data/batch_random_crop_and_resize_op.cc index 2f86998c68680d..55a2e23cdab9ec 100644 --- a/paddle/fluid/operators/data/batch_random_crop_and_resize_op.cc +++ b/paddle/fluid/operators/data/batch_random_crop_and_resize_op.cc @@ -69,11 +69,21 @@ class BatchRandomCropAndResizeOpMaker AddInput("X", "(List(Tensor)). A batch of instances to random crop.") .AsDuplicable(); AddOutput("Out", "(Tensor). The cropped instance batch."); - AddAttr("aspect_ratio_min", "").SetDefault(3. / 4.); - AddAttr("aspect_ratio_max", "").SetDefault(4. / 3.); - AddAttr("area_min", "").SetDefault(0.08); - AddAttr("area_max", "").SetDefault(1.); - AddAttr("num_attempts", "").SetDefault(10); + AddAttr("aspect_ratio_min", + "(float) The minimum aspect ratio of random cropping boxes") + .SetDefault(3. / 4.); + AddAttr("aspect_ratio_max", + "(float) The maximum aspect ratio of random cropping boxes") + .SetDefault(4. 
/ 3.); + AddAttr("area_min", + "(float) The min area ratio of random cropping boxes") + .SetDefault(0.08); + AddAttr("area_max", + "(float) The max area ratio of random cropping boxes") + .SetDefault(1.); + AddAttr("num_attempts", + "(int) The max attempt number of random cropping boxes") + .SetDefault(10); AddAttr>( "size", "expected output size of the crop, for each edge."); AddAttr("interp_method", diff --git a/paddle/fluid/operators/data/batch_resize_op.cc b/paddle/fluid/operators/data/batch_resize_op.cc index 590c61c522bfd1..564b200bed7f0c 100644 --- a/paddle/fluid/operators/data/batch_resize_op.cc +++ b/paddle/fluid/operators/data/batch_resize_op.cc @@ -92,7 +92,7 @@ class BatchResizeOpMaker : public framework::OpProtoAndCheckerMaker { "channel_first or channel_last.") .SetDefault("NCHW"); AddComment(R"DOC( - Batch resize images + Resize a batch of input images to given size. )DOC"); } }; diff --git a/paddle/fluid/operators/data/dataloader_op.cc b/paddle/fluid/operators/data/dataloader_op.cc index accf9598c519e4..46ede9d2a8fc7e 100644 --- a/paddle/fluid/operators/data/dataloader_op.cc +++ b/paddle/fluid/operators/data/dataloader_op.cc @@ -63,7 +63,13 @@ class DataLoaderOpMaker : public framework::OpProtoAndCheckerMaker { "The unique hash id used as cache key for " "ExecutorInfoCache"); AddComment(R"DOC( - DataLoader Op + DataLoader OP + This OP runs DataPipeline programs to start up DataPipeline for + multi-thread and multi-stream data loading. For DataPipeline + program construct with :code:`paddle.io.map` and + :code:`paddle.io.data_reader`, which holds independent threads + and streams, so DataLoader Op simply initialize a ParallelExecutor + to run DataPipeline progran once. 
)DOC"); } }; diff --git a/paddle/fluid/operators/data/file_label_loader_op.cc b/paddle/fluid/operators/data/file_label_loader_op.cc index 45b41ec9434c69..a09daf5b5e85da 100644 --- a/paddle/fluid/operators/data/file_label_loader_op.cc +++ b/paddle/fluid/operators/data/file_label_loader_op.cc @@ -53,7 +53,12 @@ class FileLabelLoaderOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Label", "The output label tensor of ReadFileLoader op"); AddAttr("data_root", "Path of root directory of dataset"); AddComment(R"DOC( -This operator read a file. + This operator read ImageNet format dataset for :attr:`data_root` with + given indices. + There are 2 outputs: + 1. Image: a list of Tensor which holds the image bytes data + 2. Label: a Tensor with shape [N] and dtype as int64, N is the batch + size, which is the length of input indices. )DOC"); } }; diff --git a/paddle/fluid/operators/data/map_op.cc b/paddle/fluid/operators/data/map_op.cc index a79e9f0aa216af..519b97df48407f 100644 --- a/paddle/fluid/operators/data/map_op.cc +++ b/paddle/fluid/operators/data/map_op.cc @@ -108,7 +108,8 @@ class MapOpMaker : public framework::OpProtoAndCheckerMaker { "(list of string)" "output variable names for map program"); AddComment(R"DOC( - Map Op + This OP used to split data loading stages of DataPipeline, the + map function will be run in independent C++ thread and stream. )DOC"); } }; diff --git a/paddle/fluid/operators/data/mirror_normalize_op.cc b/paddle/fluid/operators/data/mirror_normalize_op.cc index 183d9015008f3f..6608d217ab8f47 100644 --- a/paddle/fluid/operators/data/mirror_normalize_op.cc +++ b/paddle/fluid/operators/data/mirror_normalize_op.cc @@ -83,7 +83,8 @@ class MirrorNormalizeOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr>("mean", "The mean value to normalize data"); AddAttr>("std", "The stdvalue to normalize data"); AddComment(R"DOC( - Mirror Normalize Operator. + This OP perform horizintal flipping on input Tensor. 
Mirror is used + to define whether flipping is need in the give sample. )DOC"); } }; From 03783f22ad814ff5b905728cd7288aff0bb873c4 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 5 Apr 2022 15:01:59 +0000 Subject: [PATCH 93/95] add test_data_pipeline dynmic test. test=develop --- python/paddle/fluid/dataloader/ops.py | 2 +- python/paddle/tests/CMakeLists.txt | 3 +- .../tests/test_data_pipeline_dynamic.py | 93 +++++++++++++++++++ ...peline.py => test_data_pipeline_static.py} | 6 +- 4 files changed, 99 insertions(+), 5 deletions(-) create mode 100644 python/paddle/tests/test_data_pipeline_dynamic.py rename python/paddle/tests/{test_data_pipeline.py => test_data_pipeline_static.py} (97%) diff --git a/python/paddle/fluid/dataloader/ops.py b/python/paddle/fluid/dataloader/ops.py index 6a66d6af1a87e3..1645e9742d47ff 100755 --- a/python/paddle/fluid/dataloader/ops.py +++ b/python/paddle/fluid/dataloader/ops.py @@ -116,7 +116,7 @@ def flip_normalize(image): """ if _non_static_mode(): - return map_func(inputs) + return map_func(*args, **kwargs) helper = LayerHelper("map", **locals()) diff --git a/python/paddle/tests/CMakeLists.txt b/python/paddle/tests/CMakeLists.txt index 3cfb36e08bc669..dd4761e58654be 100644 --- a/python/paddle/tests/CMakeLists.txt +++ b/python/paddle/tests/CMakeLists.txt @@ -9,7 +9,8 @@ foreach(TEST_OP ${DIST_TEST_OPS}) endforeach() if (WIN32) - LIST(REMOVE_ITEM TEST_OPS test_data_pipeline) + LIST(REMOVE_ITEM TEST_OPS test_data_pipeline_static) + LIST(REMOVE_ITEM TEST_OPS test_data_pipeline_dynamic) LIST(REMOVE_ITEM TEST_OPS test_ops_file_label_loader) LIST(REMOVE_ITEM TEST_OPS test_ops_decode) LIST(REMOVE_ITEM TEST_OPS test_ops_crop_resize) diff --git a/python/paddle/tests/test_data_pipeline_dynamic.py b/python/paddle/tests/test_data_pipeline_dynamic.py new file mode 100644 index 00000000000000..dfd7ef11ed065c --- /dev/null +++ b/python/paddle/tests/test_data_pipeline_dynamic.py @@ -0,0 +1,93 @@ +# Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import cv2 +import unittest +import numpy as np + +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.utils.download import get_path_from_url +from paddle.vision.datasets import DatasetFolder +from paddle.vision.ops import image_decode_random_crop, image_resize, \ + random_flip, mirror_normalize +from paddle.vision.reader import file_label_reader + +import test_data_pipeline_static +from test_data_pipeline_static import DATASET_HOME, DATASET_URL, \ + DATASET_MD5, IMAGE_NUM + +DATASET_HOME = os.path.expanduser("~/.cache/paddle/datasets") +DATASET_URL = "https://paddlemodels.cdn.bcebos.com/ImageNet_stub.tar" +DATASET_MD5 = "c7110519124a433901cf005a4a91b607" +IMAGE_NUM = 100 + + +class TestDataPipelineDynamicCase1( + test_data_pipeline_static.TestDataPipelineStaticCase1): + def test_output(self): + # NOTE: only supoort CUDA kernel currently + if not core.is_compiled_with_cuda(): + return + + data = self.reader() + + image = data['image'].numpy() + assert image.shape[0] == self.batch_size + assert image.shape[1] == 3 + assert image.shape[2] == self.target_size + assert image.shape[3] == self.target_size + assert image.dtype == np.float32 + + restore_image = image * self.std_np + self.mean_np + assert np.all(restore_image > -1.) + assert np.all(restore_image < 256.) 
+ + label = data['label'].numpy() + assert label.shape[0] == self.batch_size + assert label.dtype == np.int64 + assert np.all(label >= 0) + assert np.all(label <= 1) + + +class TestDataPipelineDynamicCase2(TestDataPipelineDynamicCase1): + def setUp(self): + self.data_root = get_path_from_url(DATASET_URL, DATASET_HOME, + DATASET_MD5) + + self.num_epoches = 1 + self.batch_size = 16 + self.num_threads = 4 + self.host_memory_padding = 0 + self.device_memory_padding = 0 + + self.shuffle = True + self.drop_last = True + self.calc_iter_info() + + self.target_size = 128 + self.flip_prob = 0.5 + self.mean = [123.675, 116.28, 103.53] + self.std = [58.395, 57.120, 57.375] + + self.mean_np = np.array(self.mean).reshape([1, 3, 1, 1]) + self.std_np = np.array(self.std).reshape([1, 3, 1, 1]) + + self.build_reader() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/tests/test_data_pipeline.py b/python/paddle/tests/test_data_pipeline_static.py similarity index 97% rename from python/paddle/tests/test_data_pipeline.py rename to python/paddle/tests/test_data_pipeline_static.py index 15d6310aab3c30..3db5755c953785 100644 --- a/python/paddle/tests/test_data_pipeline.py +++ b/python/paddle/tests/test_data_pipeline_static.py @@ -32,7 +32,7 @@ IMAGE_NUM = 100 -class TestDataPipelineCase1(unittest.TestCase): +class TestDataPipelineStaticCase1(unittest.TestCase): def setUp(self): self.data_root = get_path_from_url(DATASET_URL, DATASET_HOME, DATASET_MD5) @@ -102,7 +102,7 @@ def flip_normalize(image): self.reader = imagenet_reader - def test_static_output(self): + def test_output(self): # NOTE: only supoort CUDA kernel currently if not core.is_compiled_with_cuda(): return @@ -136,7 +136,7 @@ def test_static_output(self): loader.reset() -class TestDataPipelineCase2(TestDataPipelineCase1): +class TestDataPipelineStaticCase2(TestDataPipelineStaticCase1): def setUp(self): self.data_root = get_path_from_url(DATASET_URL, DATASET_HOME, DATASET_MD5) From 
6d20a9fd96fa1dc5700008a63662dd02ab3cff5b Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Wed, 6 Apr 2022 03:30:22 +0000 Subject: [PATCH 94/95] add NVJPEG error meassage spider to fix ci build. test=develop --- paddle/fluid/operators/data/map_runner.h | 1 - tools/externalError/spider.py | 46 ++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/data/map_runner.h b/paddle/fluid/operators/data/map_runner.h index c1e23436480512..2d33bdf79a2581 100644 --- a/paddle/fluid/operators/data/map_runner.h +++ b/paddle/fluid/operators/data/map_runner.h @@ -14,7 +14,6 @@ #include #include #include -#include "ThreadPool.h" #include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" diff --git a/tools/externalError/spider.py b/tools/externalError/spider.py index e07f05f561cb51..0df857a2d6806a 100644 --- a/tools/externalError/spider.py +++ b/tools/externalError/spider.py @@ -361,6 +361,52 @@ def handle_data(self, data): desc.strip()) CUFFTHTMLParser().feed(html) + #*************************************************************************************************# + + #*********************************** nvJPEG Error Message **************************************# + nvjpegStatus_t = { + "NVJPEG_STATUS_SUCCESS": 0, + "NVJPEG_STATUS_NOT_INITIALIZED": 1, + "NVJPEG_STATUS_INVALID_PARAMETER": 2, + "NVJPEG_STATUS_BAD_JPEG": 3, + "NVJPEG_STATUS_JPEG_NOT_SUPPORTED": 4, + "NVJPEG_STATUS_ALLOCATOR_FAILURE": 5, + "NVJPEG_STATUS_EXECUTION_FAILED": 6, + "NVJPEG_STATUS_ARCH_MISMATCH": 7, + "NVJPEG_STATUS_INTERNAL_ERROR": 8, + "NVJPEG_STATUS_IMPLEMENTATION_NOT_SUPPORTED": 9, + "NVJPEG_STATUS_INCOMPLETE_BITSTREAM": 10, + } + + print("start crawling errorMessage for nvidia nvJPEG API--->") + url = 'https://docs.nvidia.com/cuda/nvjpeg/#nvjpeg-api-return-codes' + + allMessageDesc = externalErrorDesc.errors.add() + allMessageDesc.type = external_error_pb2.NVJPEG + + html = 
urllib.request.urlopen(url).read().decode('utf-8') + + res_div = r'Description of the returned error codes:.*?
(.*?)
' + m_div = re.findall(res_div, html, re.S | re.M)[0] + + res_dt = r'(.*?).*?colspan="1">(.*?)' + m_dt = re.findall(res_dt, m_div, re.S | re.M) + + for error in m_dt: + m_code = error[0] + m_code = m_code.split()[0].strip() + + m_message = error[1] + m_message = re.sub(r'\t', ' ', m_message) + m_message = re.sub(r'\n +', ' ', m_message) + m_message = re.sub(r'<.*?>', '', m_message) + + _Messages = allMessageDesc.messages.add() + _Messages.code = int(nvjpegStatus_t[m_code]) + _Messages.message = "'%s'. %s" % (m_code, m_message) + + print("End crawling errorMessage for nvidia NVJPEG API!\n") + #*************************************************************************************************# def main(argv): From b4571db871508581d1da9ac4968c7b35b27be421 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Sun, 10 Apr 2022 09:31:59 +0000 Subject: [PATCH 95/95] fix ci. test=develop --- paddle/fluid/operators/data/data_reader_op.h | 12 +++--------- paddle/fluid/operators/data/map_runner.cc | 10 ++-------- python/paddle/fluid/dataloader/ops.py | 2 +- 3 files changed, 6 insertions(+), 18 deletions(-) diff --git a/paddle/fluid/operators/data/data_reader_op.h b/paddle/fluid/operators/data/data_reader_op.h index 74b03d8a544267..324fa9e1e50f34 100644 --- a/paddle/fluid/operators/data/data_reader_op.h +++ b/paddle/fluid/operators/data/data_reader_op.h @@ -25,13 +25,7 @@ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" #include "paddle/fluid/platform/enforce.h" - -#ifdef _WIN32 -static unsigned sleep(unsigned seconds) { - Sleep(seconds * 1000); - return 0; -} -#endif +#include "paddle/fluid/platform/timer.h" namespace paddle { namespace operators { @@ -165,8 +159,8 @@ class DataReader { try { executor.Run(*reader_block_->Program(), &scope_, - static_cast(reader_block_->ID()), false, true, {}, - false, true); + static_cast(reader_block_->ID()), false, true, + output_var_names_, false, true); } catch (...) 
{ break; } diff --git a/paddle/fluid/operators/data/map_runner.cc b/paddle/fluid/operators/data/map_runner.cc index a1e7cf190d4efc..300cfdd246fdcd 100644 --- a/paddle/fluid/operators/data/map_runner.cc +++ b/paddle/fluid/operators/data/map_runner.cc @@ -13,13 +13,7 @@ #include "paddle/fluid/framework/executor_cache.h" #include "paddle/fluid/operators/data/map_runner.h" - -#ifdef _WIN32 -static unsigned sleep(unsigned seconds) { - Sleep(seconds * 1000); - return 0; -} -#endif +#include "paddle/fluid/platform/timer.h" namespace paddle { namespace operators { @@ -138,7 +132,7 @@ void MapRunner::StartMapThread(const Scope* scope) { try { executor.Run(*map_block_->Program(), &scope_, static_cast(map_block_->ID()), false, true, - std::vector(), false, true); + output_var_names_, false, true); } catch (...) { break; } diff --git a/python/paddle/fluid/dataloader/ops.py b/python/paddle/fluid/dataloader/ops.py index 1645e9742d47ff..a7a6288483cf3b 100755 --- a/python/paddle/fluid/dataloader/ops.py +++ b/python/paddle/fluid/dataloader/ops.py @@ -18,8 +18,8 @@ from ...fluid import core, framework, Program, program_guard, unique_name from ...fluid.layers.utils import _hash_with_id +from ..layer_helper import LayerHelper from ...fluid.framework import _non_static_mode -from ...common_ops_import import * from collections.abc import Sequence, Mapping