diff --git a/paddle/fluid/operators/data/batch_decode_op.cc b/paddle/fluid/operators/data/batch_decode_op.cc
index d0f8dae02e5d17..648f2033e9a037 100644
--- a/paddle/fluid/operators/data/batch_decode_op.cc
+++ b/paddle/fluid/operators/data/batch_decode_op.cc
@@ -30,8 +30,8 @@ class BatchDecodeOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(framework::proto::VarType::UINT8,
-                                   ctx.GetPlace());
+    return framework::OpKernelType(
+        framework::proto::VarType::UINT8, ctx.GetPlace());
   }
 
   framework::OpKernelType GetKernelTypeForVar(
@@ -67,7 +67,8 @@ or 1 dimensional Gray Tensor. Optionally converts the image to the
 desired format. The values of the output tensor are uint8 between 0 
 and 255.
 )DOC");
-    AddAttr<int>("num_threads", "Path of the file to be readed.").SetDefault(2);
+    AddAttr<int>("num_threads", "(int, default 2) Number of decode threads.")
+      .SetDefault(2);
     AddAttr<int>("local_rank",
                  "(int)"
                  "The index of the op to start execution");
@@ -77,13 +78,11 @@ and 255.
                      "decode thread pool");
     AddAttr<int64_t>(
         "host_memory_padding",
-        "(int64, default 0),"
-        "pinned memory allocation padding number for Nvjpeg decoding")
+        "(int64, default 0), pinned memory allocation padding number for Nvjpeg decoding")
         .SetDefault(0);
     AddAttr<int64_t>(
         "device_memory_padding",
-        "(int64, default 0),"
-        "device memory allocation padding number for Nvjpeg decoding")
+        "(int64, default 0), device memory allocation padding number for Nvjpeg decoding")
         .SetDefault(0);
   }
 };
diff --git a/paddle/fluid/operators/data/batch_decode_op.cu b/paddle/fluid/operators/data/batch_decode_op.cu
index 0b640bbb3b986f..93b93b9c674f87 100644
--- a/paddle/fluid/operators/data/batch_decode_op.cu
+++ b/paddle/fluid/operators/data/batch_decode_op.cu
@@ -14,7 +14,6 @@
 
 #if !defined(WITH_NV_JETSON) && !defined(PADDLE_WITH_HIP)
 
-#include "paddle/fluid/operators/data/batch_decode_op.h"
 #include "paddle/fluid/operators/data/batch_decode_random_crop_op.h"
 #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
 
@@ -22,8 +21,7 @@ namespace paddle {
 namespace operators {
 namespace data {
 
-using LoDTensorBlockingQueueHolder =
-    operators::reader::LoDTensorBlockingQueueHolder;
+using LoDTensorBlockingQueueHolder = operators::reader::LoDTensorBlockingQueueHolder;
 
 template <typename T>
 class GPUBatchDecodeKernel : public framework::OpKernel<T> {
@@ -36,12 +34,12 @@ class GPUBatchDecodeKernel : public framework::OpKernel<T> {
     auto device_memory_padding = ctx.Attr<int64_t>("device_memory_padding");
 
     // multi-phrase decode thread pool
-    auto* decode_pool =
-        ImageDecoderThreadPoolManager::Instance()->GetDecoderThreadPool(
-            program_id, num_threads, local_rank,
-            static_cast<size_t>(host_memory_padding),
-            static_cast<size_t>(device_memory_padding));
-
+    auto* decode_pool = 
+      ImageDecoderThreadPoolManager::Instance()->GetDecoderThreadPool(
+                          program_id, num_threads, local_rank,
+                          static_cast<size_t>(host_memory_padding),
+                          static_cast<size_t>(device_memory_padding));
+    
     const framework::LoDTensorArray* inputs =
         ctx.Input<framework::LoDTensorArray>("X");
 
@@ -54,11 +52,13 @@ class GPUBatchDecodeKernel : public framework::OpKernel<T> {
       auto* x_data = x.data<T>();
       size_t x_numel = static_cast<size_t>(x.numel());
 
-      ImageDecodeTask task = {.bit_stream = x_data,
-                              .bit_len = x_numel,
-                              .tensor = &out_array[i],
-                              .roi_generator = nullptr,
-                              .place = ctx.GetPlace()};
+      ImageDecodeTask task = {
+        .bit_stream = x_data,
+        .bit_len = x_numel,
+        .tensor = &out_array[i],
+        .roi_generator = nullptr,
+        .place = ctx.GetPlace()
+      };
       decode_pool->AddTask(std::make_shared<ImageDecodeTask>(task));
     }
 
diff --git a/paddle/fluid/operators/data/batch_decode_random_crop_op.cc b/paddle/fluid/operators/data/batch_decode_random_crop_op.cc
index 508802154d25f7..3e981a201fd477 100644
--- a/paddle/fluid/operators/data/batch_decode_random_crop_op.cc
+++ b/paddle/fluid/operators/data/batch_decode_random_crop_op.cc
@@ -23,29 +23,21 @@ class BatchDecodeRandomCropOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_GE(ctx->Inputs("X").size(), 1UL,
-                      platform::errors::InvalidArgument(
-                          "Inputs(X) of DecodeJpeg should not be empty."));
-    PADDLE_ENFORCE_GE(ctx->Outputs("Out").size(), 1UL,
-                      platform::errors::InvalidArgument(
-                          "Outputs(Out) of DecodeJpeg should not be empty."));
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "DecodeJpeg");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "DecodeJpeg");
+
     auto aspect_ratio_min = ctx->Attrs().Get<float>("aspect_ratio_min");
     auto aspect_ratio_max = ctx->Attrs().Get<float>("aspect_ratio_max");
-    PADDLE_ENFORCE_GT(
-        aspect_ratio_min, 0.,
-        platform::errors::InvalidArgument(
+    PADDLE_ENFORCE_GT(aspect_ratio_min, 0.,
+          platform::errors::InvalidArgument(
             "aspect_ratio_min should be greater than 0, but received "
-            "%f",
-            aspect_ratio_min));
-    PADDLE_ENFORCE_GT(
-        aspect_ratio_max, 0.,
-        platform::errors::InvalidArgument(
+            "%f", aspect_ratio_min));
+    PADDLE_ENFORCE_GT(aspect_ratio_max, 0.,
+          platform::errors::InvalidArgument(
             "aspect_ratio_max should be greater than 0, but received "
-            "%f",
-            aspect_ratio_max));
-    PADDLE_ENFORCE_GE(
-        aspect_ratio_max, aspect_ratio_min,
-        platform::errors::InvalidArgument(
+            "%f", aspect_ratio_max));
+    PADDLE_ENFORCE_GE(aspect_ratio_max, aspect_ratio_min,
+          platform::errors::InvalidArgument(
             "aspect_ratio_max should be greater than aspect_ratio_min, "
             "but received aspect_ratio_max(%d) < aspect_ratio_min(%d)",
             aspect_ratio_max, aspect_ratio_min));
@@ -53,34 +45,31 @@ class BatchDecodeRandomCropOp : public framework::OperatorWithKernel {
     auto area_min = ctx->Attrs().Get<float>("area_min");
     auto area_max = ctx->Attrs().Get<float>("area_max");
     PADDLE_ENFORCE_GT(area_min, 0.,
-                      platform::errors::InvalidArgument(
-                          "area_minshould be greater than 0, but received "
-                          "%f",
-                          area_min));
+          platform::errors::InvalidArgument(
+            "area_min should be greater than 0, but received "
+            "%f", area_min));
     PADDLE_ENFORCE_GT(area_max, 0.,
-                      platform::errors::InvalidArgument(
-                          "area_max should be greater than 0, but received "
-                          "%f",
-                          area_max));
+          platform::errors::InvalidArgument(
+            "area_max should be greater than 0, but received "
+            "%f", area_max));
     PADDLE_ENFORCE_GE(area_max, area_min,
-                      platform::errors::InvalidArgument(
-                          "area_max should be greater than area_min, "
-                          "but received area_max(%f) < area_min(%f)",
-                          area_max, area_min));
+          platform::errors::InvalidArgument(
+            "area_max should be greater than area_min, "
+            "but received area_max(%f) < area_min(%f)",
+            area_max, area_min));
 
-    auto num_attempts = ctx->Attrs().Get<int64_t>("num_attempts");
+    auto num_attempts= ctx->Attrs().Get<int64_t>("num_attempts");
     PADDLE_ENFORCE_GT(num_attempts, 0,
-                      platform::errors::InvalidArgument(
-                          "num_attempts should be a positive integerm, but "
-                          "received %d",
-                          num_attempts));
+          platform::errors::InvalidArgument(
+            "num_attempts should be a positive integer, but "
+            "received %d", num_attempts));
   }
 
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(framework::proto::VarType::UINT8,
-                                   ctx.GetPlace());
+    return framework::OpKernelType(
+        framework::proto::VarType::UINT8, ctx.GetPlace());
   }
 
   framework::OpKernelType GetKernelTypeForVar(
@@ -108,9 +97,8 @@ class BatchDecodeRandomCropOpMaker : public framework::OpProtoAndCheckerMaker {
   void Make() override {
     AddInput("X",
              "A one dimensional uint8 tensor containing the raw bytes "
-             "of the JPEG image. It is a tensor with rank 1.")
-        .AsDuplicable();
-    AddOutput("Out", "The output tensor of DecodeJpeg op").AsDuplicable();
+             "of the JPEG image. It is a tensor with rank 1.");
+    AddOutput("Out", "The output tensor of DecodeJpeg op");
     AddComment(R"DOC(
 This operator decodes a JPEG image into a 3 dimensional RGB Tensor 
 or 1 dimensional Gray Tensor. Optionally converts the image to the 
@@ -120,14 +108,15 @@ and 255.
     AddAttr<int>("local_rank",
                  "(int64_t)"
                  "The index of the op to start execution");
-    AddAttr<int>("num_threads", "Path of the file to be readed.").SetDefault(2);
-    AddAttr<int64_t>("host_memory_padding",
-                     "(int64, default 0), pinned memory allocation padding "
-                     "number for Nvjpeg decoding")
+    AddAttr<int>("num_threads", "(int, default 2) Number of decode threads.")
+      .SetDefault(2);
+    AddAttr<int64_t>(
+        "host_memory_padding",
+        "(int64, default 0), pinned memory allocation padding number for Nvjpeg decoding")
         .SetDefault(0);
-    AddAttr<int64_t>("device_memory_padding",
-                     "(int64, default 0), device memory allocation padding "
-                     "number for Nvjpeg decoding")
+    AddAttr<int64_t>(
+        "device_memory_padding",
+        "(int64, default 0), device memory allocation padding number for Nvjpeg decoding")
         .SetDefault(0);
     AddAttr<std::string>(
         "data_format",
@@ -136,8 +125,8 @@ and 255.
         "Specify that the data format of the input and output data is "
         "channel_first or channel_last.")
         .SetDefault("NCHW");
-    AddAttr<float>("aspect_ratio_min", "").SetDefault(3. / 4.);
-    AddAttr<float>("aspect_ratio_max", "").SetDefault(4. / 3.);
+    AddAttr<float>("aspect_ratio_min", "").SetDefault(3./4.);
+    AddAttr<float>("aspect_ratio_max", "").SetDefault(4./3.);
     AddAttr<float>("area_min", "").SetDefault(0.08);
     AddAttr<float>("area_max", "").SetDefault(1.);
     AddAttr<int64_t>("num_attempts", "").SetDefault(10);
@@ -155,10 +144,8 @@ and 255.
 namespace ops = paddle::operators;
 
 REGISTER_OPERATOR(
-    batch_decode_random_crop, ops::data::BatchDecodeRandomCropOp,
-    ops::data::BatchDecodeRandomCropOpMaker,
+    batch_decode_random_crop, ops::data::BatchDecodeRandomCropOp, ops::data::BatchDecodeRandomCropOpMaker,
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>)
 
-REGISTER_OP_CPU_KERNEL(batch_decode_random_crop,
-                       ops::data::CPUBatchDecodeRandomCropKernel<uint8_t>)
+REGISTER_OP_CPU_KERNEL(batch_decode_random_crop, ops::data::CPUBatchDecodeRandomCropKernel<uint8_t>)
diff --git a/paddle/fluid/operators/data/batch_decode_random_crop_op.cu b/paddle/fluid/operators/data/batch_decode_random_crop_op.cu
index 9e882ac8eac88c..e65bce62e2536b 100644
--- a/paddle/fluid/operators/data/batch_decode_random_crop_op.cu
+++ b/paddle/fluid/operators/data/batch_decode_random_crop_op.cu
@@ -25,8 +25,7 @@ namespace paddle {
 namespace operators {
 namespace data {
 
-using LoDTensorBlockingQueueHolder =
-    operators::reader::LoDTensorBlockingQueueHolder;
+using LoDTensorBlockingQueueHolder = operators::reader::LoDTensorBlockingQueueHolder;
 using DataLayout = framework::DataLayout;
 
 ImageDecoderThreadPool* decode_pool = nullptr;
@@ -42,17 +41,21 @@ class GPUBatchDecodeRandomCropKernel : public framework::OpKernel<T> {
     auto device_memory_padding = ctx.Attr<int64_t>("device_memory_padding");
 
     // multi-phrase decode thread pool
-    auto* decode_pool =
-        ImageDecoderThreadPoolManager::Instance()->GetDecoderThreadPool(
-            program_id, num_threads, local_rank,
-            static_cast<size_t>(host_memory_padding),
-            static_cast<size_t>(device_memory_padding));
+    auto* decode_pool = 
+      ImageDecoderThreadPoolManager::Instance()->GetDecoderThreadPool(
+                          program_id, num_threads, local_rank,
+                          static_cast<size_t>(host_memory_padding),
+                          static_cast<size_t>(device_memory_padding));
 
-    auto inputs = ctx.MultiInput<framework::LoDTensor>("X");
-    int batch_size = inputs.size();
+    const framework::LoDTensorArray* inputs =
+        ctx.Input<framework::LoDTensorArray>("X");
+    int batch_size = inputs->size();
 
-    auto out_array = ctx.MultiOutput<framework::LoDTensor>("Out");
+    auto* out = ctx.OutputVar("Out");
     auto dev = platform::CUDAPlace(local_rank);
+    
+    auto& out_array = *out->GetMutable<framework::LoDTensorArray>();
+    out_array.resize(batch_size);
 
     const std::string data_format_str = ctx.Attr<std::string>("data_format");
     const DataLayout data_format =
@@ -72,46 +75,52 @@ class GPUBatchDecodeRandomCropKernel : public framework::OpKernel<T> {
     AreaRange area_range{area_min, area_max};
 
     auto* generators = GeneratorManager::Instance()->GetGenerators(
-        program_id, batch_size, aspect_ratio_range, area_range);
-
-    for (size_t i = 0; i < inputs.size(); i++) {
-      const framework::LoDTensor* x = inputs.at(i);
-      auto* x_data = x->data<T>();
-      size_t x_numel = static_cast<size_t>(x->numel());
-
-      if (data_format == DataLayout::kNCHW) {
-        ImageDecodeTask task = {.bit_stream = x_data,
-                                .bit_len = x_numel,
-                                .tensor = &temp_array[i],
-                                .roi_generator = generators->at(i).get(),
-                                .place = dev};
+                          program_id, batch_size, aspect_ratio_range,
+                          area_range);
+
+    for (size_t i = 0; i < inputs->size(); i++) {
+      const framework::LoDTensor x = inputs->at(i);
+      auto* x_data = x.data<T>();
+      size_t x_numel = static_cast<size_t>(x.numel());
+      
+      if (data_format == DataLayout::kNCHW){
+        ImageDecodeTask task = {
+          .bit_stream = x_data,
+          .bit_len = x_numel,
+          .tensor = &temp_array[i],
+          .roi_generator = generators->at(i).get(),
+          .place = dev
+        };
         decode_pool->AddTask(std::make_shared<ImageDecodeTask>(task));
-      } else {
-        ImageDecodeTask task = {.bit_stream = x_data,
-                                .bit_len = x_numel,
-                                .tensor = out_array[i],
-                                .roi_generator = generators->at(i).get(),
-                                .place = dev};
+      }
+      else{
+        ImageDecodeTask task = {
+          .bit_stream = x_data,
+          .bit_len = x_numel,
+          .tensor = &out_array[i],
+          .roi_generator = generators->at(i).get(),
+          .place = dev
+        };
         decode_pool->AddTask(std::make_shared<ImageDecodeTask>(task));
       }
+      
     }
 
     decode_pool->RunAll(true);
 
-    if (data_format == DataLayout::kNCHW) {
+    if (data_format == DataLayout::kNCHW){
       const auto& dev_ctx = ctx.cuda_device_context();
       phi::funcs::Transpose<paddle::platform::CUDADeviceContext, T, 3> trans;
       std::vector<int> axis = {2, 0, 1};
-      for (size_t i = 0; i < inputs.size(); i++) {
+      for (size_t i = 0; i < inputs->size(); i++) {
         // Do transpose
         const framework::DDim& in_sizes = temp_array[i].dims();
         framework::DDim transposed_input_shape = in_sizes.transpose(axis);
         std::vector<int64_t> transposed_input_shape_ =
             phi::vectorize(transposed_input_shape);
-
-        out_array[i]->Resize(transposed_input_shape);
-        out_array[i]->mutable_data<T>(dev_ctx.GetPlace());
-        trans(dev_ctx, temp_array[i], out_array[i], axis);
+        out_array[i].Resize(transposed_input_shape);
+        out_array[i].mutable_data<T>(dev_ctx.GetPlace());
+        trans(dev_ctx, temp_array[i], &out_array[i], axis);
       }
     }
   }
@@ -122,7 +131,6 @@ class GPUBatchDecodeRandomCropKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(batch_decode_random_crop,
-                        ops::data::GPUBatchDecodeRandomCropKernel<uint8_t>)
+REGISTER_OP_CUDA_KERNEL(batch_decode_random_crop, ops::data::GPUBatchDecodeRandomCropKernel<uint8_t>)
 
 #endif
diff --git a/paddle/fluid/operators/data/batch_resize_op.cc b/paddle/fluid/operators/data/batch_resize_op.cc
index e46f12cb6b23ed..d3fbbfd17f58ad 100644
--- a/paddle/fluid/operators/data/batch_resize_op.cc
+++ b/paddle/fluid/operators/data/batch_resize_op.cc
@@ -24,27 +24,28 @@ class BatchResizeOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_GE(ctx->Inputs("X").size(), 1UL,
-                      platform::errors::InvalidArgument(
-                          "Inputs(X) of BatchResize should not be empty."));
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "BatchResize");
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "BatchResize");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out",
+                   "BatchResize");
 
     auto size = ctx->Attrs().Get<std::vector<int64_t>>("size");
     PADDLE_ENFORCE_EQ(size.size(), 2,
                       platform::errors::InvalidArgument(
                           "The length of Attrs(size) should be 2."));
-    PADDLE_ENFORCE_GT(size[0], 0, platform::errors::InvalidArgument(
-                                      "h in Attr(size) of Op(BatchResize) "
-                                      "should be greater than 0."));
-    PADDLE_ENFORCE_GT(size[1], 0, platform::errors::InvalidArgument(
-                                      "w in Attr(size) of Op(BatchResize) "
-                                      "should be greater than 0."));
+    PADDLE_ENFORCE_GT(size[0], 0,
+                      platform::errors::InvalidArgument(
+                          "h in Attr(size) of Op(BatchResize) "
+                          "should be greater than 0."));
+    PADDLE_ENFORCE_GT(size[1], 0,
+                      platform::errors::InvalidArgument(
+                          "w in Attr(size) of Op(BatchResize) "
+                          "should be greater than 0."));
   }
 
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(framework::proto::VarType::UINT8,
-                                   ctx.GetPlace());
+    return framework::OpKernelType(
+        framework::proto::VarType::UINT8, ctx.GetPlace());
   }
 
   framework::OpKernelType GetKernelTypeForVar(
@@ -61,8 +62,7 @@ class BatchResizeOp : public framework::OperatorWithKernel {
 class BatchResizeOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
-    AddInput("X", "(List[LoDTensor]). A batch of instances to random crop.")
-        .AsDuplicable();
+    AddInput("X", "(LoDTensorArray). A batch of instances to random crop.");
     AddOutput("Out", "(Tensor). The cropped instance batch.");
     AddAttr<std::vector<int64_t>>(
         "size", "expected output size of the crop, for each edge.");
@@ -103,8 +103,10 @@ class BatchResizeOpMaker : public framework::OpProtoAndCheckerMaker {
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(
-    batch_resize, ops::data::BatchResizeOp, ops::data::BatchResizeOpMaker,
+    batch_resize, ops::data::BatchResizeOp,
+    ops::data::BatchResizeOpMaker,
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
 
-REGISTER_OP_CPU_KERNEL(batch_resize, ops::data::BatchResizeCPUKernel<uint8_t>)
+REGISTER_OP_CPU_KERNEL(batch_resize,
+                       ops::data::BatchResizeCPUKernel<uint8_t>)
diff --git a/paddle/fluid/operators/data/batch_resize_op.cu b/paddle/fluid/operators/data/batch_resize_op.cu
index e1164043b8c117..4953e39801d3de 100644
--- a/paddle/fluid/operators/data/batch_resize_op.cu
+++ b/paddle/fluid/operators/data/batch_resize_op.cu
@@ -76,10 +76,10 @@ __global__ void KeNearestNeighborInterpFw(
 template <typename T>
 __global__ void KeBilinearInterpFw(
     const T* in, const size_t in_img_h, const size_t in_img_w,
-    const size_t input_h, const size_t input_w, T* out, const size_t out_img_h,
-    const size_t out_img_w, const size_t output_h, const size_t output_w,
-    const size_t num_channels, const float ratio_h, const float ratio_w,
-    const bool align_corners, const int align_mode,
+    const size_t input_h, const size_t input_w, T* out,
+    const size_t out_img_h, const size_t out_img_w, const size_t output_h,
+    const size_t output_w, const size_t num_channels, const float ratio_h,
+    const float ratio_w, const bool align_corners, const int align_mode,
     const DataLayout data_format) {
   int nthreads = output_h * output_w;
   int tid = blockIdx.x * blockDim.x + threadIdx.x;
@@ -114,8 +114,8 @@ __global__ void KeBilinearInterpFw(
     int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0;
     float src_h = ratio_h * (out_img_idy + 0.5) - 0.5;
     src_h = src_h > 0 ? src_h : 0;
-    float h1lambda =
-        align_flag ? src_h - in_img_idy : ratio_h * out_img_idy - in_img_idy;
+    float h1lambda = align_flag ? src_h - in_img_idy
+                            : ratio_h * out_img_idy - in_img_idy;
     float h2lambda = 1.f - h1lambda;
 
     // get input w index with offset
@@ -126,8 +126,8 @@ __global__ void KeBilinearInterpFw(
     int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0;
     float src_w = ratio_w * (out_img_idx + 0.5) - 0.5;
     src_w = src_w > 0 ? src_w : 0;
-    float w1lambda =
-        align_flag ? src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx;
+    float w1lambda = align_flag ? src_w - in_img_idx
+                            : ratio_w * out_img_idx - in_img_idx;
     float w2lambda = 1.f - w1lambda;
 
     if (data_format == DataLayout::kNCHW) {
@@ -135,34 +135,33 @@ __global__ void KeBilinearInterpFw(
                             in_img_idy * in_img_w + in_img_idx];
 
       // bilinear interpolation
-      out[out_id_h * output_w + out_id_w] =
-          (T)(h2lambda * (w2lambda * in_pos[0] + w1lambda * in_pos[w_id]) +
-              h1lambda * (w2lambda * in_pos[h_id * in_img_w] +
-                          w1lambda * in_pos[h_id * in_img_w + w_id]));
+      out[out_id_h * output_w + out_id_w] = (T)(
+          h2lambda * (w2lambda * in_pos[0] + w1lambda * in_pos[w_id]) +
+          h1lambda * (w2lambda * in_pos[h_id * in_img_w] +
+                      w1lambda * in_pos[h_id * in_img_w + w_id]));
     } else {
       const T* in_pos =
           &in[out_id_h * input_w + in_img_idy * in_img_w * num_channels +
               in_img_idx * num_channels + channel_id];
 
       // bilinear interpolation
-      out[out_id_h * output_w + out_id_w] =
-          (T)(h2lambda * (w2lambda * in_pos[0] +
-                          w1lambda * in_pos[w_id * num_channels]) +
-              h1lambda * (w2lambda * in_pos[h_id * in_img_w * num_channels] +
-                          w1lambda * in_pos[h_id * in_img_w * num_channels +
-                                            w_id * num_channels]));
+      out[out_id_h * output_w + out_id_w] = (T)(
+          h2lambda *
+              (w2lambda * in_pos[0] + w1lambda * in_pos[w_id * num_channels]) +
+          h1lambda * (w2lambda * in_pos[h_id * in_img_w * num_channels] +
+                      w1lambda * in_pos[h_id * in_img_w * num_channels +
+                                        w_id * num_channels]));
     }
   }
 }
 
 template <typename T>
-static void ResizeFwd(const framework::ExecutionContext& ctx,
-                      const framework::LoDTensor& input,
-                      framework::Tensor* output,
-                      const std::vector<int64_t> out_size,
-                      const std::string interp_method, const bool align_corners,
-                      const int align_mode, const int img_h, const int img_w,
-                      const int c, const DataLayout data_format) {
+static void ResizeFwd(
+    const framework::ExecutionContext& ctx, const framework::LoDTensor& input,
+    framework::Tensor* output, const std::vector<int64_t> out_size,
+    const std::string interp_method, const bool align_corners,
+    const int align_mode, const int img_h, const int img_w, const int c,
+    const DataLayout data_format) {
   auto input_data = input.template data<T>();
   int out_h = static_cast<int>(out_size[0]);
   int out_w = static_cast<int>(out_size[1]);
@@ -202,7 +201,8 @@ static void ResizeFwd(const framework::ExecutionContext& ctx,
     KeBilinearInterpFw<T><<<config.block_per_grid, config.thread_per_block, 0,
                             ctx.cuda_device_context().stream()>>>(
         input_data, img_h, img_w, 1, in_chw, output_data, out_h, out_w, 1,
-        out_chw, c, ratio_h, ratio_w, align_corners, align_mode, data_format);
+        out_chw, c, ratio_h, ratio_w, align_corners, align_mode,
+        data_format);
   }
 }
 
@@ -214,8 +214,8 @@ class BatchResizeCUDAKernel : public framework::OpKernel<T> {
         platform::is_gpu_place(ctx.GetPlace()), true,
         platform::errors::NotFound("This kernel only runs on GPU device."));
     // get input, output
-    auto x = ctx.MultiInput<framework::LoDTensor>("X");
-    PADDLE_ENFORCE_GT(x.size(), 0,
+    auto* x = ctx.Input<framework::LoDTensorArray>("X");
+    PADDLE_ENFORCE_GT(x->size(), 0,
                       platform::errors::InvalidArgument(
                           "The size of X must be greater than 0."));
     auto* out = ctx.Output<framework::LoDTensor>("Out");
@@ -231,28 +231,30 @@ class BatchResizeCUDAKernel : public framework::OpKernel<T> {
     bool align_corners = ctx.Attr<bool>("align_corners");
     int align_mode = ctx.Attr<int>("align_mode");
 
-    auto* img = x.at(0);
-    int64_t img_c =
-        data_format == DataLayout::kNCHW ? img->dims()[0] : img->dims()[2];
+    auto* img = &x->at(0);
+    int64_t img_c = data_format == DataLayout::kNCHW ? \
+                  img->dims()[0] : img->dims()[2];
 
-    std::vector<int64_t> out_dim = {static_cast<int64_t>(x.size()), size[0],
-                                    size[1], img_c};
+    std::vector<int64_t> out_dim = {static_cast<int64_t>(x->size()),
+                                    size[0], size[1], img_c};
     if (data_format == DataLayout::kNCHW) {
-      out_dim = {static_cast<int64_t>(x.size()), img_c, size[0], size[1]};
+      out_dim = {static_cast<int64_t>(x->size()),
+                                    img_c, size[0], size[1]};
     }
     out->Resize(phi::make_ddim(out_dim));
     out->mutable_data<T>(ctx.GetPlace());
 
     int img_h, img_w, idx_h, idx_w, crop_h, crop_w;
-    for (int i = 0; i < x.size(); i++) {
-      img = x.at(i);
+    for (int i = 0; i < x->size(); i++) {
+      img = &x->at(i);
       img_h =
           data_format == DataLayout::kNCHW ? img->dims()[1] : img->dims()[0];
       img_w =
           data_format == DataLayout::kNCHW ? img->dims()[2] : img->dims()[1];
       auto out_tensor = out->Slice(i, i + 1);
-      ResizeFwd<T>(ctx, *img, &out_tensor, size, interp_method, align_corners,
-                   align_mode, img_h, img_w, img_c, data_format);
+      ResizeFwd<T>(ctx, *img, &out_tensor, size, interp_method,
+                   align_corners, align_mode, img_h, img_w, img_c,
+                   data_format);
     }
   }
 };
@@ -262,5 +264,6 @@ class BatchResizeCUDAKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(batch_resize, ops::data::BatchResizeCUDAKernel<uint8_t>,
+REGISTER_OP_CUDA_KERNEL(batch_resize,
+                        ops::data::BatchResizeCUDAKernel<uint8_t>,
                         ops::data::BatchResizeCUDAKernel<float>);
diff --git a/paddle/fluid/operators/data/file_label_loader_op.cc b/paddle/fluid/operators/data/file_label_loader_op.cc
index 530d51ec35d358..3b26438db00d7f 100644
--- a/paddle/fluid/operators/data/file_label_loader_op.cc
+++ b/paddle/fluid/operators/data/file_label_loader_op.cc
@@ -27,9 +27,9 @@ class FileLabelLoaderOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(ctx->HasInput("Indices"), true,
                       platform::errors::InvalidArgument(
                           "Input(Indices) of ReadFileLoaderOp is null."));
-    // PADDLE_ENFORCE_EQ(ctx->HasOutput("Image"), true,
-    //                   platform::errors::InvalidArgument(
-    //                       "Output(Image) of ReadFileLoaderOp is null."));
+    PADDLE_ENFORCE_EQ(ctx->HasOutput("Image"), true,
+                      platform::errors::InvalidArgument(
+                          "Output(Image) of ReadFileLoaderOp is null."));
     PADDLE_ENFORCE_EQ(ctx->HasOutput("Label"), true,
                       platform::errors::InvalidArgument(
                           "Output(Label) of ReadFileLoaderOp is null."));
@@ -51,8 +51,7 @@ class FileLabelLoaderOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
     AddInput("Indices", "The batch indices of input samples");
-    AddOutput("Image", "The output image tensor of ReadFileLoader op")
-        .AsDuplicable();
+    AddOutput("Image", "The output image tensor of ReadFileLoader op");
     AddOutput("Label", "The output label tensor of ReadFileLoader op");
     AddAttr<std::string>("data_root", "Path of root directory of dataset");
     AddComment(R"DOC(
@@ -72,5 +71,4 @@ REGISTER_OPERATOR(
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>)
 
-REGISTER_OP_CPU_KERNEL(file_label_loader,
-                       ops::FileLabelLoaderCPUKernel<uint8_t>)
+REGISTER_OP_CPU_KERNEL(file_label_loader, ops::FileLabelLoaderCPUKernel<uint8_t>)
diff --git a/paddle/fluid/operators/data/file_label_loader_op.h b/paddle/fluid/operators/data/file_label_loader_op.h
index 995c410b7966ed..7e6b0a555acafe 100644
--- a/paddle/fluid/operators/data/file_label_loader_op.h
+++ b/paddle/fluid/operators/data/file_label_loader_op.h
@@ -13,19 +13,19 @@
 // limitations under the License.
 
 #pragma once
-#include <dirent.h>
-#include <sys/stat.h>
-#include <cstring>
 #include <fstream>
 #include <string>
+#include <cstring>
 #include <vector>
+#include <sys/stat.h>
+#include <dirent.h>
 
 #include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
 
 namespace paddle {
 namespace operators {
@@ -39,7 +39,8 @@ constexpr char DIR_SEP = '\\';
 constexpr char DIR_SEP = '/';
 #endif
 
-static std::string JoinPath(const std::string path1, const std::string path2) {
+static std::string JoinPath(const std::string& path1,
+                            const std::string& path2) {
   // empty check
   if (path1.empty()) return path2;
-  if (path1.empty()) return path1;
+  if (path2.empty()) return path1;  // was a duplicated path1 check (dead code)
@@ -55,12 +56,11 @@ static std::string JoinPath(const std::string path1, const std::string path2) {
   return path1 + DIR_SEP + path2;
 }
 
-static void ParseFilesAndLabels(
-    const std::string data_root,
-    std::vector<std::pair<std::string, int>>* samples) {
+static void ParseFilesAndLabels(const std::string data_root,
+              std::vector<std::pair<std::string, int>>* samples) {
   auto* dir = opendir(data_root.c_str());
   PADDLE_ENFORCE_NE(dir, nullptr, platform::errors::InvalidArgument(
-                                      "Cannot open directory %s", data_root));
+                      "Cannot open directory %s", data_root));
 
   // Step 1: parse classes info
   std::vector<std::string> classes;
@@ -69,13 +69,13 @@ static void ParseFilesAndLabels(
     if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0) {
       entry = readdir(dir);
       continue;
-    }
+    } 
 
     auto cls_path = JoinPath(data_root, entry->d_name);
     struct stat s;
     int ret = stat(cls_path.c_str(), &s);
     PADDLE_ENFORCE_EQ(ret, 0, platform::errors::InvalidArgument(
-                                  "Directory %s is unaccessiable.", cls_path));
+          "Directory %s is unaccessiable.", cls_path));
 
     if (S_ISDIR(s.st_mode)) classes.emplace_back(entry->d_name);
 
@@ -89,12 +89,13 @@ static void ParseFilesAndLabels(
 
   // Step 2: traverse directory to generate samples
   for (int class_id = 0; class_id < static_cast<int>(classes.size());
-       class_id++) {
-    auto cur_dir = data_root + DIR_SEP + classes[class_id];
+      class_id++) {
+    auto cur_dir = data_root + DIR_SEP + classes[class_id]; 
     dir = opendir(cur_dir.c_str());
     entry = readdir(dir);
     while (entry) {
-      if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0) {
+      if (strcmp(entry->d_name, ".") == 0
+          || strcmp(entry->d_name, "..") == 0) {
         entry = readdir(dir);
         continue;
       }
@@ -106,13 +107,12 @@ static void ParseFilesAndLabels(
     }
     closedir(dir);
   }
+  
 }
 
-std::map<std::string, std::vector<std::pair<std::string, int>>>
-    root_to_samples_;
+std::map<std::string, std::vector<std::pair<std::string, int>>> root_to_samples_;
 
-static std::vector<std::pair<std::string, int>>* GetFilesAndLabelsFromCache(
-    const std::string data_root) {
+static std::vector<std::pair<std::string, int>>* GetFilesAndLabelsFromCache(const std::string data_root) {
   auto iter = root_to_samples_.find(data_root);
   if (iter == root_to_samples_.end()) {
     std::vector<std::pair<std::string, int>> samples;
@@ -120,16 +120,16 @@ static std::vector<std::pair<std::string, int>>* GetFilesAndLabelsFromCache(
     VLOG(4) << "Init sample number: " << samples.size();
     root_to_samples_[data_root] = samples;
   }
-
+  
   return &(root_to_samples_[data_root]);
 }
 
 template <typename T>
-class FileLabelLoaderCPUKernel : public framework::OpKernel<T> {
+class FileLabelLoaderCPUKernel: public framework::OpKernel<T> {
  public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
+   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* indices = ctx.Input<LoDTensor>("Indices");
-    auto image_arr = ctx.MultiOutput<LoDTensor>("Image");
+    auto* image_arr = ctx.Output<LoDTensorArray>("Image");
     auto* label_tensor = ctx.Output<LoDTensor>("Label");
 
     auto data_root = ctx.Attr<std::string>("data_root");
@@ -138,9 +138,11 @@ class FileLabelLoaderCPUKernel : public framework::OpKernel<T> {
     auto batch_size = indices->dims()[0];
     const int64_t* indices_data = indices->data<int64_t>();
 
-    label_tensor->Resize(phi::make_ddim({static_cast<int64_t>(batch_size)}));
-    auto* label_data =
-        label_tensor->mutable_data<int64_t>(platform::CPUPlace());
+    image_arr->clear();
+    image_arr->reserve(batch_size);
+    label_tensor->Resize(
+        phi::make_ddim({static_cast<int64_t>(batch_size)}));
+    auto* label_data = label_tensor->mutable_data<int64_t>(platform::CPUPlace());
     for (int64_t i = 0; i < batch_size; i++) {
       int64_t index = static_cast<int>(indices_data[i]);
       auto file = samples->at(index).first;
@@ -151,14 +153,15 @@ class FileLabelLoaderCPUKernel : public framework::OpKernel<T> {
 
       input.seekg(0, std::ios::beg);
 
-      auto image = image_arr[i];
+      framework::LoDTensor image;
       std::vector<int64_t> image_len = {file_size};
-      image->Resize(phi::make_ddim(image_len));
+      image.Resize(phi::make_ddim(image_len));
 
-      uint8_t* data = image->mutable_data<uint8_t>(platform::CPUPlace());
+      uint8_t* data = image.mutable_data<uint8_t>(platform::CPUPlace());
 
       input.read(reinterpret_cast<char*>(data), file_size);
 
+      image_arr->emplace_back(image);
       label_data[i] = static_cast<int64_t>(label);
     }
   }
@@ -171,6 +174,7 @@ class FileLabelLoaderCPUKernel : public framework::OpKernel<T> {
     framework::TensorCopy(lod_tensor, lod_tensor.place(), &out_tensor);
     out_tensor.set_lod(lod_tensor.lod());
   }
+
 };
 
 }  // namespace data
diff --git a/paddle/fluid/operators/data/image_decoder.cc b/paddle/fluid/operators/data/image_decoder.cc
index 37fec0433f9ca4..4c39030d1ad4f2 100644
--- a/paddle/fluid/operators/data/image_decoder.cc
+++ b/paddle/fluid/operators/data/image_decoder.cc
@@ -18,89 +18,74 @@ namespace paddle {
 namespace operators {
 namespace data {
 
-ImageDecoder::ImageDecoder(int dev_id, size_t host_memory_padding,
-                           size_t device_memory_padding)
-    : nvjpeg_streams_(2), pinned_buffers_(2), page_id_(0) {
+ImageDecoder::ImageDecoder(int dev_id,
+                           size_t host_memory_padding,
+                           size_t device_memory_padding) 
+  : nvjpeg_streams_(2),
+    pinned_buffers_(2),
+    page_id_(0) {
   platform::SetDeviceId(dev_id);
 
   // create nvjpeg handle and stream
-  PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegCreateEx(
-      NVJPEG_BACKEND_HYBRID, &device_allocator_, &pinned_allocator_, 0,
-      &handle_));
+  PADDLE_ENFORCE_NVJPEG_SUCCESS(
+      platform::dynload::nvjpegCreateEx(NVJPEG_BACKEND_HYBRID, &device_allocator_,
+                           &pinned_allocator_, 0, &handle_));
 
   // set pinned/device memory padding
   if (host_memory_padding > 0) {
-    PADDLE_ENFORCE_NVJPEG_SUCCESS(
-        platform::dynload::nvjpegSetPinnedMemoryPadding(host_memory_padding,
-                                                        handle_));
+    PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegSetPinnedMemoryPadding(host_memory_padding, handle_));
   }
   if (device_memory_padding > 0) {
-    PADDLE_ENFORCE_NVJPEG_SUCCESS(
-        platform::dynload::nvjpegSetDeviceMemoryPadding(device_memory_padding,
-                                                        handle_));
+    PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegSetDeviceMemoryPadding(device_memory_padding, handle_));
   }
 
   // create nvjpeg stream
   for (size_t i = 0; i < nvjpeg_streams_.size(); i++) {
-    PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegJpegStreamCreate(
-        handle_, &nvjpeg_streams_[i]));
+    PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegJpegStreamCreate(handle_, &nvjpeg_streams_[i]));
   }
 
   // create decode params, decoder and state
-  PADDLE_ENFORCE_NVJPEG_SUCCESS(
-      platform::dynload::nvjpegDecodeParamsCreate(handle_, &decode_params_));
-  PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecoderCreate(
-      handle_, NVJPEG_BACKEND_HYBRID, &decoder_));
-  PADDLE_ENFORCE_NVJPEG_SUCCESS(
-      platform::dynload::nvjpegDecoderStateCreate(handle_, decoder_, &state_));
+  PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecodeParamsCreate(handle_, &decode_params_));
+  PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecoderCreate(handle_, NVJPEG_BACKEND_HYBRID, &decoder_));
+  PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecoderStateCreate(handle_, decoder_, &state_));
 
   // create device & pinned buffer
-  PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegBufferDeviceCreate(
-      handle_, &device_allocator_, &device_buffer_));
+  PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegBufferDeviceCreate(handle_, &device_allocator_, &device_buffer_));
   for (size_t i = 0; i < pinned_buffers_.size(); i++) {
-    PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegBufferPinnedCreate(
-        handle_, &pinned_allocator_, &pinned_buffers_[i]));
+    PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegBufferPinnedCreate(handle_, &pinned_allocator_, &pinned_buffers_[i]));
   }
 }
 
 ImageDecoder::~ImageDecoder() {
   // destroy nvjpeg streams
   for (size_t i = 0; i < nvjpeg_streams_.size(); i++) {
-    PADDLE_ENFORCE_NVJPEG_SUCCESS(
-        platform::dynload::nvjpegJpegStreamDestroy(nvjpeg_streams_[i]));
+    PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegJpegStreamDestroy(nvjpeg_streams_[i]));
   }
 
   // destroy decode params, decoder and state
-  PADDLE_ENFORCE_NVJPEG_SUCCESS(
-      platform::dynload::nvjpegDecodeParamsDestroy(decode_params_));
-  PADDLE_ENFORCE_NVJPEG_SUCCESS(
-      platform::dynload::nvjpegDecoderDestroy(decoder_));
-  PADDLE_ENFORCE_NVJPEG_SUCCESS(
-      platform::dynload::nvjpegJpegStateDestroy(state_));
+  PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecodeParamsDestroy(decode_params_));
+  PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecoderDestroy(decoder_));
+  PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegJpegStateDestroy(state_));
 
-  PADDLE_ENFORCE_NVJPEG_SUCCESS(
-      platform::dynload::nvjpegBufferDeviceDestroy(device_buffer_));
+  PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegBufferDeviceDestroy(device_buffer_));
   for (size_t i = 0; i < pinned_buffers_.size(); i++) {
-    PADDLE_ENFORCE_NVJPEG_SUCCESS(
-        platform::dynload::nvjpegBufferPinnedDestroy(pinned_buffers_[i]));
+    PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegBufferPinnedDestroy(pinned_buffers_[i]));
   }
 
   // destroy nvjpeg handle at last
   PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDestroy(handle_));
 }
 
-void ImageDecoder::CPUDecodeRandomCrop(const uint8_t* data, size_t length,
-                                       RandomROIGenerator* roi_generator,
-                                       unsigned char* workspace,
-                                       size_t workspace_size,
-                                       framework::LoDTensor* out,
-                                       platform::Place place) {
+void ImageDecoder::CPUDecodeRandomCrop(
+                        const uint8_t* data, size_t length,
+                        RandomROIGenerator* roi_generator,
+                        unsigned char* workspace, size_t workspace_size,
+                        framework::LoDTensor* out, platform::Place place) {
   VLOG(4) << "CPUDecodeRandomCropResize enter";
 #ifdef PADDLE_WITH_OPENCV
-  cv::Mat image = cv::imdecode(
-      cv::Mat(1, length, CV_8UC1, const_cast<unsigned char*>(data)),
-      cv::IMREAD_COLOR);
-
+  cv::Mat image =
+      cv::imdecode(cv::Mat(1, length, CV_8UC1, const_cast<unsigned char*>(data)), cv::IMREAD_COLOR);
+  
   cv::Mat cropped;
   int height = image.rows;
   int width = image.cols;
@@ -126,8 +111,7 @@ void ImageDecoder::CPUDecodeRandomCrop(const uint8_t* data, size_t length,
   cpu_tensor.Resize(phi::make_ddim(out_shape));
   auto* cpu_data = cpu_tensor.mutable_data<uint8_t>(platform::CPUPlace());
 
-  cv::Mat cpu_mat(height, width, CV_8UC3, const_cast<unsigned char*>(cpu_data),
-                  cv::Mat::AUTO_STEP);
+  cv::Mat cpu_mat(height, width, CV_8UC3, const_cast<unsigned char*>(cpu_data), cv::Mat::AUTO_STEP);
   cv::cvtColor(cropped, cpu_mat, cv::COLOR_BGR2RGB);
 
   // copy cpu tensor to output gpu tensor
@@ -147,24 +131,22 @@ nvjpegStatus_t ImageDecoder::ParseDecodeParams(
   int widths[NVJPEG_MAX_COMPONENT];
   int heights[NVJPEG_MAX_COMPONENT];
 
-  nvjpegStatus_t status = platform::dynload::nvjpegGetImageInfo(
-      handle_, bit_stream, bit_len, &components, &subsampling, widths, heights);
+  
+  nvjpegStatus_t status = platform::dynload::nvjpegGetImageInfo(handle_, bit_stream, bit_len,
+                         &components, &subsampling, widths, heights);
 
   if (status != NVJPEG_STATUS_SUCCESS) return status;
 
   int64_t width = static_cast<int64_t>(widths[0]);
   int64_t height = static_cast<int64_t>(heights[0]);
 
-  PADDLE_ENFORCE_NVJPEG_SUCCESS(
-      platform::dynload::nvjpegDecodeParamsSetOutputFormat(decode_params_,
-                                                           NVJPEG_OUTPUT_RGBI));
+  PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecodeParamsSetOutputFormat(decode_params_, NVJPEG_OUTPUT_RGBI));
 
   if (roi_generator) {
     ROI roi;
     roi_generator->GenerateRandomROI(width, height, &roi);
 
-    PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecodeParamsSetROI(
-        decode_params_, roi.x, roi.y, roi.w, roi.h));
+    PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecodeParamsSetROI(decode_params_, roi.x, roi.y, roi.w, roi.h));
     height = roi.h;
     width = roi.w;
   }
@@ -180,72 +162,55 @@ nvjpegStatus_t ImageDecoder::ParseDecodeParams(
   return NVJPEG_STATUS_SUCCESS;
 }
 
-nvjpegStatus_t ImageDecoder::GPUDecodeRandomCrop(const uint8_t* bit_stream,
-                                                 size_t bit_len,
-                                                 nvjpegImage_t* out_image) {
+nvjpegStatus_t ImageDecoder::GPUDecodeRandomCrop(const uint8_t* bit_stream, size_t bit_len, nvjpegImage_t* out_image) {
   auto buffer = pinned_buffers_[page_id_];
   auto stream = nvjpeg_streams_[page_id_];
   page_id_ ^= 1;
 
   // decode jpeg in host to pinned buffer
-  PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegJpegStreamParse(
-      handle_, bit_stream, bit_len, false, false, stream));
-  PADDLE_ENFORCE_NVJPEG_SUCCESS(
-      platform::dynload::nvjpegStateAttachPinnedBuffer(state_, buffer));
-  nvjpegStatus_t status = platform::dynload::nvjpegDecodeJpegHost(
-      handle_, decoder_, state_, decode_params_, stream);
+  PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegJpegStreamParse(handle_, bit_stream, bit_len, false, false, stream));
+  PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegStateAttachPinnedBuffer(state_, buffer));
+  nvjpegStatus_t status = platform::dynload::nvjpegDecodeJpegHost(handle_, decoder_, state_, decode_params_, stream);
   if (status != NVJPEG_STATUS_SUCCESS) return status;
 
   // transfer and decode to device buffer
-  PADDLE_ENFORCE_NVJPEG_SUCCESS(
-      platform::dynload::nvjpegStateAttachDeviceBuffer(state_, device_buffer_));
-  PADDLE_ENFORCE_NVJPEG_SUCCESS(
-      platform::dynload::nvjpegDecodeJpegTransferToDevice(
-          handle_, decoder_, state_, stream, cuda_stream_));
-  status = platform::dynload::nvjpegDecodeJpegDevice(handle_, decoder_, state_,
-                                                     out_image, nullptr);
+  PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegStateAttachDeviceBuffer(state_, device_buffer_));
+  PADDLE_ENFORCE_NVJPEG_SUCCESS(platform::dynload::nvjpegDecodeJpegTransferToDevice(handle_, decoder_, state_, stream, cuda_stream_));
+  status = platform::dynload::nvjpegDecodeJpegDevice(handle_, decoder_, state_, out_image, nullptr);
   return status;
 }
 
-void ImageDecoder::Run(const uint8_t* bit_stream, size_t bit_len,
-                       framework::LoDTensor* out,
-                       RandomROIGenerator* roi_generator,
-                       platform::Place& place) {
+void ImageDecoder::Run(
+    const uint8_t* bit_stream, size_t bit_len, framework::LoDTensor* out,
+    RandomROIGenerator* roi_generator, platform::Place& place) {
   nvjpegImage_t image;
-
-  nvjpegStatus_t status =
-      ParseDecodeParams(bit_stream, bit_len, out, roi_generator, &image, place);
+  nvjpegStatus_t status = ParseDecodeParams(bit_stream, bit_len, out, roi_generator, &image, place);
   if (status != NVJPEG_STATUS_SUCCESS) {
-    CPUDecodeRandomCrop(bit_stream, bit_len, roi_generator, nullptr, 0, out,
-                        place);
+    CPUDecodeRandomCrop(bit_stream, bit_len, roi_generator, nullptr, 0, out, place);
     return;
   }
-
   status = GPUDecodeRandomCrop(bit_stream, bit_len, &image);
   if (status != NVJPEG_STATUS_SUCCESS) {
-    CPUDecodeRandomCrop(bit_stream, bit_len, roi_generator, nullptr, 0, out,
-                        place);
+    CPUDecodeRandomCrop(bit_stream, bit_len, roi_generator, nullptr, 0, out, place);
   }
 }
 
 ImageDecoderThreadPool::ImageDecoderThreadPool(
-    const int num_threads, const int dev_id, const size_t host_memory_padding,
-    const size_t device_memory_padding)
-    : threads_(num_threads),
-      dev_id_(dev_id),
-      shutdown_(false),
-      running_(false),
-      completed_(false),
-      outstand_tasks_(0) {
-  PADDLE_ENFORCE_GT(num_threads, 0,
-                    platform::errors::InvalidArgument(
-                        "num_threads shoule be a positive interger, "
-                        "but got %d",
-                        num_threads));
+    const int num_threads, const int dev_id,
+    const size_t host_memory_padding, const size_t device_memory_padding)
+  : threads_(),  // filled by emplace_back below; pre-sizing doubled the pool
+    dev_id_(dev_id),
+    shutdown_(false),
+    running_(false),
+    completed_(false),
+    outstand_tasks_(0) {
+  PADDLE_ENFORCE_GT(num_threads, 0, platform::errors::InvalidArgument(
+                    "num_threads should be a positive integer, "
+                    "but got %d", num_threads));
   for (int i = 0; i < num_threads; i++) {
     threads_.emplace_back(
-        std::thread(std::bind(&ImageDecoderThreadPool::ThreadLoop, this, i,
-                              host_memory_padding, device_memory_padding)));
+        std::thread(std::bind(&ImageDecoderThreadPool::ThreadLoop,
+            this, i, host_memory_padding, device_memory_padding)));
   }
 }
 
@@ -285,29 +250,29 @@ void ImageDecoderThreadPool::ShutDown() {
   task_queue_.clear();
 
   for (auto& thread : threads_) {
-    if (thread.joinable()) thread.join();
+    if (thread.joinable())  thread.join();
   }
 }
 
 void ImageDecoderThreadPool::SortTaskByLengthDescend() {
   std::lock_guard<std::mutex> lock(mutex_);
   std::sort(task_queue_.begin(), task_queue_.end(),
-            [](const std::shared_ptr<ImageDecodeTask> a,
-               const std::shared_ptr<ImageDecodeTask> b) {
-              return b->bit_len < a->bit_len;
-            });
+      [](const std::shared_ptr<ImageDecodeTask>& a,
+         const std::shared_ptr<ImageDecodeTask>& b) {
+          return b->bit_len < a->bit_len;  // longest bitstream first
+      });
 }
 
-void ImageDecoderThreadPool::ThreadLoop(const int thread_idx,
-                                        const size_t host_memory_padding,
-                                        const size_t device_memory_padding) {
-  ImageDecoder* decoder =
-      new ImageDecoder(dev_id_, host_memory_padding, device_memory_padding);
+void ImageDecoderThreadPool::ThreadLoop(
+      const int thread_idx, const size_t host_memory_padding,
+      const size_t device_memory_padding) {
+  ImageDecoder* decoder = new ImageDecoder(dev_id_,
+                                           host_memory_padding,
+                                           device_memory_padding);
+
   while (!shutdown_) {
     std::unique_lock<std::mutex> lock(mutex_);
-    running_cond_.wait(lock, [this] {
-      return (running_ && !task_queue_.empty()) || shutdown_;
-    });
+    running_cond_.wait(lock, [this] { return (running_ && !task_queue_.empty()) || shutdown_; });
     if (shutdown_) break;
 
     auto task = task_queue_.front();
@@ -329,8 +294,7 @@ void ImageDecoderThreadPool::ThreadLoop(const int thread_idx,
 }
 
 // initialization static variables out of ImageDecoderThreadPoolManager
-ImageDecoderThreadPoolManager* ImageDecoderThreadPoolManager::pm_instance_ptr_ =
-    nullptr;
+ImageDecoderThreadPoolManager* ImageDecoderThreadPoolManager::pm_instance_ptr_ = nullptr;
 std::mutex ImageDecoderThreadPoolManager::m_;
 
 }  // namespace data
diff --git a/paddle/fluid/operators/data/image_decoder.h b/paddle/fluid/operators/data/image_decoder.h
index dd2513e8358abb..de332f0a2b963f 100644
--- a/paddle/fluid/operators/data/image_decoder.h
+++ b/paddle/fluid/operators/data/image_decoder.h
@@ -14,17 +14,17 @@ limitations under the License. */
 
 #pragma once
 
-#include <map>
 #include <vector>
+#include <map>
 
 #ifdef PADDLE_WITH_OPENCV
-#include <opencv2/opencv.hpp>
+  #include <opencv2/opencv.hpp>
 #endif
 
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/platform/dynload/nvjpeg.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/platform/dynload/nvjpeg.h"
 #include "paddle/fluid/platform/stream/cuda_stream.h"
 
 #include "paddle/fluid/operators/data/random_roi_generator.h"
@@ -33,17 +33,17 @@ namespace paddle {
 namespace operators {
 namespace data {
 
-static int dev_malloc(void** p, size_t s) {
-  return static_cast<int>(cudaMalloc(p, s));
-}
+static int dev_malloc(void** p, size_t s) { return static_cast<int>(cudaMalloc(p, s)); }
 
-static int dev_free(void* p) { return static_cast<int>(cudaFree(p)); }
+static int dev_free(void *p) { return (int)cudaFree(p); }
 
 static int host_malloc(void** p, size_t s, unsigned int f) {
   return static_cast<int>(cudaHostAlloc(p, s, f));
 }
 
-static int host_free(void* p) { return static_cast<int>(cudaFreeHost(p)); }
+static int host_free(void* p) {
+  return static_cast<int>(cudaFreeHost(p));
+}
 
 struct ImageDecodeTask {
   const uint8_t* bit_stream;
@@ -54,92 +54,92 @@ struct ImageDecodeTask {
 };
 
 class ImageDecoder {
- public:
-  ImageDecoder(int dev_id, size_t host_memory_padding = 0,
-               size_t device_memory_padding = 0);
+  public:
+    ImageDecoder(int dev_id,
+                 size_t host_memory_padding=0,
+                 size_t device_memory_padding=0);
 
-  ~ImageDecoder();
+    ~ImageDecoder();
 
-  void Run(const uint8_t* bit_stream, size_t bit_len, framework::LoDTensor* out,
-           RandomROIGenerator* roi_generator, const platform::Place& place);
+    void Run(const uint8_t* bit_stream, size_t bit_len, framework::LoDTensor* out, 
+             RandomROIGenerator* roi_generator, platform::Place& place);
 
- private:
-  DISABLE_COPY_AND_ASSIGN(ImageDecoder);
+  private:
+    DISABLE_COPY_AND_ASSIGN(ImageDecoder);
+
+    void CPUDecodeRandomCrop(const uint8_t* data, size_t length,
+                             RandomROIGenerator* roi_generator,
+                             unsigned char* workspace, size_t workspace_size,
+                             framework::LoDTensor* out, platform::Place place);
 
-  void CPUDecodeRandomCrop(const uint8_t* data, size_t length,
-                           RandomROIGenerator* roi_generator,
-                           unsigned char* workspace, size_t workspace_size,
-                           framework::LoDTensor* out, platform::Place place);
+    nvjpegStatus_t ParseDecodeParams(
+        const uint8_t* bit_stream, size_t bit_len, framework::LoDTensor* out,
+        RandomROIGenerator* roi_generator, nvjpegImage_t* out_image,
+        platform::Place place);
 
-  nvjpegStatus_t ParseDecodeParams(const uint8_t* bit_stream, size_t bit_len,
-                                   framework::LoDTensor* out,
-                                   RandomROIGenerator* roi_generator,
-                                   nvjpegImage_t* out_image,
-                                   platform::Place place);
+    nvjpegStatus_t GPUDecodeRandomCrop(const uint8_t* bit_stream, size_t bit_len, nvjpegImage_t* out_image);
 
-  nvjpegStatus_t GPUDecodeRandomCrop(const uint8_t* bit_stream, size_t bit_len,
-                                     nvjpegImage_t* out_image);
 
-  cudaStream_t cuda_stream_ = nullptr;
-  std::vector<nvjpegJpegStream_t> nvjpeg_streams_;
+    cudaStream_t cuda_stream_ = nullptr;
+    std::vector<nvjpegJpegStream_t> nvjpeg_streams_;
 
-  nvjpegHandle_t handle_ = nullptr;
-  nvjpegJpegState_t state_ = nullptr;
-  nvjpegJpegDecoder_t decoder_ = nullptr;
-  nvjpegDecodeParams_t decode_params_ = nullptr;
+    nvjpegHandle_t handle_ = nullptr;
+    nvjpegJpegState_t state_ = nullptr;
+    nvjpegJpegDecoder_t decoder_ = nullptr;
+    nvjpegDecodeParams_t decode_params_ = nullptr;
 
-  nvjpegPinnedAllocator_t pinned_allocator_ = {&host_malloc, &host_free};
-  nvjpegDevAllocator_t device_allocator_ = {&dev_malloc, &dev_free};
-  std::vector<nvjpegBufferPinned_t> pinned_buffers_;
-  nvjpegBufferDevice_t device_buffer_ = nullptr;
+    nvjpegPinnedAllocator_t pinned_allocator_ = {&host_malloc, &host_free};
+    nvjpegDevAllocator_t device_allocator_ = {&dev_malloc, &dev_free};
+    std::vector<nvjpegBufferPinned_t> pinned_buffers_;
+    nvjpegBufferDevice_t device_buffer_ = nullptr;
 
-  int page_id_;
+    int page_id_;
 };
 
 class ImageDecoderThreadPool {
- public:
-  ImageDecoderThreadPool(const int num_threads, const int dev_id,
-                         size_t host_memory_padding,
-                         size_t device_memory_padding);
+  public:
+    ImageDecoderThreadPool(const int num_threads, const int dev_id,
+                           size_t host_memory_padding,
+                           size_t device_memory_padding);
 
-  ~ImageDecoderThreadPool();
+    ~ImageDecoderThreadPool();
 
-  void AddTask(std::shared_ptr<ImageDecodeTask> task);
+    void AddTask(std::shared_ptr<ImageDecodeTask> task);
 
-  void RunAll(const bool wait, const bool sort = true);
+    void RunAll(const bool wait, const bool sort = true);
 
-  void WaitTillTasksCompleted();
+    void WaitTillTasksCompleted();
 
-  void ShutDown();
+    void ShutDown();
 
- private:
-  DISABLE_COPY_AND_ASSIGN(ImageDecoderThreadPool);
+  private:
+    DISABLE_COPY_AND_ASSIGN(ImageDecoderThreadPool);
 
-  void SortTaskByLengthDescend();
+    void SortTaskByLengthDescend();
 
-  void ThreadLoop(const int thread_idx, const size_t host_memory_padding,
-                  const size_t device_memory_padding);
+    void ThreadLoop(const int thread_idx, const size_t host_memory_padding,
+                    const size_t device_memory_padding);
 
-  std::vector<std::thread> threads_;
-  int dev_id_;
+    std::vector<std::thread> threads_;
+    int dev_id_;
 
-  std::deque<std::shared_ptr<ImageDecodeTask>> task_queue_;
-  std::mutex mutex_;
+    std::deque<std::shared_ptr<ImageDecodeTask>> task_queue_;
+    std::mutex mutex_;
 
-  bool shutdown_;
-  std::condition_variable running_cond_;
-  bool running_;
-  std::condition_variable completed_cond_;
-  bool completed_;
+    bool shutdown_;
+    std::condition_variable running_cond_;
+    bool running_;
+    std::condition_variable completed_cond_;
+    bool completed_;
 
-  int outstand_tasks_;
+    int outstand_tasks_;
 };
 
 class ImageDecoderThreadPoolManager {
  private:
   DISABLE_COPY_AND_ASSIGN(ImageDecoderThreadPoolManager);
 
-  static ImageDecoderThreadPoolManager* pm_instance_ptr_;
+  static ImageDecoderThreadPoolManager *pm_instance_ptr_;
   static std::mutex m_;
 
   std::map<int64_t, std::unique_ptr<ImageDecoderThreadPool>> prog_id_to_pool_;
@@ -157,12 +157,15 @@ class ImageDecoderThreadPoolManager {
 
   ImageDecoderThreadPool* GetDecoderThreadPool(
       const int64_t program_id, const int num_threads, const int dev_id,
-      const size_t host_memory_padding, const size_t device_memory_padding) {
+      const size_t host_memory_padding,
+      const size_t device_memory_padding) {
     auto iter = prog_id_to_pool_.find(program_id);
     if (iter == prog_id_to_pool_.end()) {
-      prog_id_to_pool_[program_id] =
-          std::unique_ptr<ImageDecoderThreadPool>(new ImageDecoderThreadPool(
-              num_threads, dev_id, host_memory_padding, device_memory_padding));
+      prog_id_to_pool_[program_id] = 
+        std::unique_ptr<ImageDecoderThreadPool>(
+            new ImageDecoderThreadPool(num_threads, dev_id,
+                                       host_memory_padding,
+                                       device_memory_padding));
     }
     return prog_id_to_pool_[program_id].get();
   }
@@ -177,7 +180,7 @@ class ImageDecoderThreadPoolManager {
 
   void ShutDown() {
     if (prog_id_to_pool_.empty()) return;
-
+    
     std::lock_guard<std::mutex> lk(m_);
     auto iter = prog_id_to_pool_.begin();
     for (; iter != prog_id_to_pool_.end(); iter++) {
diff --git a/python/paddle/fluid/dataloader/ops.py b/python/paddle/fluid/dataloader/ops.py
index 5658852f1836f0..adfb0a8f23e07b 100755
--- a/python/paddle/fluid/dataloader/ops.py
+++ b/python/paddle/fluid/dataloader/ops.py
@@ -21,6 +21,7 @@
 from ...fluid.framework import in_dygraph_mode
 from ...common_ops_import import *
 
+
 __all__ = ["map", "data_reader"]
 
 
@@ -61,26 +62,9 @@ def _generate_stream_id():
 
 
 def map(map_func, inputs=[]):
-    def _build_program_inputs(x, map_block):
-        assert isinstance(x, (list, tuple))
-        assert len(x) > 0, "map function must have inputs"
-        outputs = []
-        if isinstance(x[0], (list, tuple)):
-            for item in x:
-                outputs.append(_build_program_inputs(item, map_block))
-        else:
-            for item in x:
-                outputs.append(
-                    map_block.create_var(
-                        name=unique_name.generate("map_sub"),
-                        type=item.desc.type(),
-                        dtype=item.desc.dtype(),
-                        persistable=False))
-        return outputs
-
     inputs = _to_list(inputs)
     if in_dygraph_mode():
-        return map_func(inputs)
+        return map_func(*inputs)
 
     helper = LayerHelper("map", **locals())
 
@@ -90,19 +74,16 @@ def _build_program_inputs(x, map_block):
         program_id = _hash_with_id(main_program, map_func)
         map_block = main_program.current_block()
 
-        program_inputs = _build_program_inputs(inputs, map_block)
-
+        program_inputs = [
+            map_block.create_var(
+                name=unique_name.generate("map_sub"),
+                type=inp.desc.type(),
+                dtype=inp.desc.dtype(),
+                persistable=False) for inp in inputs]
         program_outputs = map_func(*program_inputs)
         program_outputs = _to_list(program_outputs)
-        input_var_names = []
-        for variables in program_inputs:
-            if isinstance(variables, (list, tuple)):
-                inputs = inputs[0]
-                for v in variables:
-                    input_var_names.append(v.name)
-            else:
-                input_var_names.append(variables.name)
-
+    
+        input_var_names = [v.name for v in program_inputs]
         output_var_names = [v.name for v in program_outputs]
 
     outputs = \
@@ -147,39 +128,22 @@ def data_reader(reader_func,
         reader_block = main_program.current_block()
 
         indices_var = reader_block.create_var(
-            name=unique_name.generate("data_reader_sub"),
-            type=core.VarDesc.VarType.LOD_TENSOR,
-            dtype="int64",
-            persistable=False)
+                        name=unique_name.generate("data_reader_sub"),
+                        type=core.VarDesc.VarType.LOD_TENSOR,
+                        dtype="int64",
+                        persistable=False)
         program_outputs = reader_func(indices_var)
         program_outputs = _to_list(program_outputs)
-
+    
         indices_var_name = indices_var.name
-        output_var_names = []
-        for outs in program_outputs:
-            if isinstance(outs, (list, tuple)):
-                for out in outs:
-                    output_var_names.append(out.name)
-            else:
-                output_var_names.append(outs.name)
-
-    outputs = []
-    for outps in program_outputs:
-        if isinstance(outps, (list, tuple)):
-            for outp in outps:
-                outputs.append(
-                    helper.create_variable(
-                        name=unique_name.generate("data_reader"),
-                        type=outp.desc.type(),
-                        dtype=outp.desc.dtype(),
-                        persistable=True))
-        else:
-            outputs.append(
-                helper.create_variable(
-                    name=unique_name.generate("data_reader"),
-                    type=outps.desc.type(),
-                    dtype=outps.desc.dtype(),
-                    persistable=True))
+        output_var_names = [v.name for v in program_outputs]
+
+    outputs = \
+        [helper.create_variable(
+            name=unique_name.generate("data_reader"),
+            type=outp.desc.type(),
+            dtype=outp.desc.dtype(),
+            persistable=True) for outp in program_outputs]
 
     attrs = {
         "reader_id": _hash_with_id(main_program),
@@ -196,6 +160,9 @@ def data_reader(reader_func,
     }
 
     helper.append_op(
-        type="data_reader", inputs={}, outputs={"Out": outputs}, attrs=attrs)
+        type="data_reader",
+        inputs={},
+        outputs={"Out": outputs},
+        attrs=attrs)
 
     return outputs
diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py
index d35a79c964ceb5..7b6fd81a57869f 100644
--- a/python/paddle/vision/ops.py
+++ b/python/paddle/vision/ops.py
@@ -868,10 +868,8 @@ def read_file(filename, name=None):
     return out
 
 
-def image_decode(x,
-                 num_threads=2,
-                 host_memory_padding=0,
-                 device_memory_padding=0,
+def image_decode(x, num_threads=2,
+                 host_memory_padding=0, device_memory_padding=0,
                  name=None):
     """
     Decodes a JPEG image into a 3 dimensional RGB Tensor or 1 dimensional Gray Tensor. 
@@ -914,18 +912,17 @@ def image_decode(x,
                            core.VarDesc.VarType.LOD_TENSOR_ARRAY, False)
         program_id = utils._hash_with_id(mode, num_threads, name, local_rank)
         return _C_ops.batch_decode(
-            x, out, "num_threads", num_threads, "local_rank", local_rank,
-            "program_id", program_id, "host_memory_padding",
-            host_memory_padding, "device_memory_padding", device_memory_padding)
+                x, out, "num_threads", num_threads,
+                "local_rank", local_rank, "program_id", program_id,
+                "host_memory_padding", host_memory_padding,
+                "device_memory_padding", device_memory_padding)
 
     inputs = {'X': x}
-    attrs = {
-        "num_threads": num_threads,
-        "local_rank": local_rank,
-        "program_id": utils._hash_with_id(default_main_program()),
-        "host_memory_padding": host_memory_padding,
-        "device_memory_padding": device_memory_padding
-    }
+    attrs = {"num_threads": num_threads,
+             "local_rank": local_rank,
+             "program_id": utils._hash_with_id(default_main_program()),
+             "host_memory_padding": host_memory_padding,
+             "device_memory_padding": device_memory_padding}
 
     helper = LayerHelper("batch_decode", **locals())
     out = helper.create_variable(
@@ -943,8 +940,8 @@ def image_decode_random_crop(x,
                              host_memory_padding=0,
                              device_memory_padding=0,
                              data_format='NCHW',
-                             aspect_ratio_min=3. / 4.,
-                             aspect_ratio_max=4. / 3.,
+                             aspect_ratio_min=3./4.,
+                             aspect_ratio_max=4./3.,
                              area_min=0.08,
                              area_max=1.,
                              num_attempts=10,
@@ -987,48 +984,39 @@ def image_decode_random_crop(x,
     local_rank = paddle.distributed.get_rank()
     if in_dygraph_mode():
         out = core.VarBase(core.VarDesc.VarType.UINT8, [],
-                           unique_name.generate("image_decode_random_crop"),
-                           core.VarDesc.VarType.LOD_TENSOR_ARRAY, False)
+                unique_name.generate("image_decode_random_crop"),
+                core.VarDesc.VarType.LOD_TENSOR_ARRAY, False)
         program_id = utils._hash_with_id(mode, num_threads, name, local_rank)
         return _C_ops.batch_decode_random_crop(
-            x, out, "num_threads", num_threads, "data_format", data_format,
-            "aspect_ratio_min", aspect_ratio_min, "aspect_ratio_max",
-            aspect_ratio_max, "area_min", area_min, "area_max", area_max,
-            "num_attempts", num_attempts, "local_rank", local_rank,
-            "program_id", program_id, "host_memory_padding",
-            host_memory_padding, "device_memory_padding", device_memory_padding)
+                x, out, "num_threads", num_threads,
+                "data_format", data_format, "aspect_ratio_min",
+                aspect_ratio_min, "aspect_ratio_max", aspect_ratio_max,
+                "area_min", area_min, "area_max", area_max,
+                "num_attempts", num_attempts, "local_rank", local_rank,
+                "program_id", program_id,
+                "host_memory_padding", host_memory_padding,
+                "device_memory_padding", device_memory_padding)
 
     inputs = {'X': x}
-    attrs = {
-        "num_threads": num_threads,
-        "host_memory_padding": host_memory_padding,
-        "device_memory_padding": device_memory_padding,
-        "data_format": data_format,
-        "aspect_ratio_min": aspect_ratio_min,
-        "aspect_ratio_max": aspect_ratio_max,
-        "area_min": area_min,
-        "area_max": area_max,
-        "num_attempts": num_attempts,
-        "local_rank": local_rank,
-        "program_id": utils._hash_with_id(default_main_program())
-    }
+    attrs = {"num_threads": num_threads,
+             "host_memory_padding": host_memory_padding,
+             "device_memory_padding": device_memory_padding,
+             "data_format": data_format,
+             "aspect_ratio_min": aspect_ratio_min,
+             "aspect_ratio_max": aspect_ratio_max,
+             "area_min": area_min,
+             "area_max": area_max,
+             "num_attempts": num_attempts, 
+             "local_rank": local_rank,
+             "program_id": utils._hash_with_id(default_main_program())}
 
     helper = LayerHelper("batch_decode_random_crop", **locals())
-    # out = helper.create_variable(
-    #     name=unique_name.generate("image_decode_random_crop"),
-    #     type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
-    #     dtype=x.dtype)
-    out = [
-        helper.create_variable(
-            name=unique_name.generate("file_label_loader"),
-            type=core.VarDesc.VarType.LOD_TENSOR,
-            dtype='uint8') for i in range(len(x))
-    ]
+    out = helper.create_variable(
+        name=unique_name.generate("image_decode_random_crop"),
+        type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
+        dtype=x.dtype)
     helper.append_op(
-        type="batch_decode_random_crop",
-        inputs=inputs,
-        attrs=attrs,
-        outputs={"Out": out})
+        type="batch_decode_random_crop", inputs=inputs, attrs=attrs, outputs={"Out": out})
 
     return out
 
@@ -1037,12 +1025,12 @@ def random_flip(x, prob=0.5, name=None):
     if prob < 0. or prob > 1.:
         raise ValueError("prob should in (0, 1) in random_flip")
 
-    rand_vec = layers.uniform_random_batch_size_like(x, [1, 1], min=0., max=1.)
+    rand_vec = layers.uniform_random_batch_size_like(
+                                    x, [1, 1], min=0., max=1.)
     return rand_vec < prob
 
 
-def mirror_normalize(x,
-                     mirror,
+def mirror_normalize(x, mirror,
                      mean=[123.675, 116.28, 103.53],
                      std=[58.395, 57.120, 57.375],
                      name=None):
@@ -1061,18 +1049,17 @@ def _to_list_3(l):
     std = _to_list_3(std)
 
     if _non_static_mode():
-        return _C_ops.mirror_normalize(x, mirror, "mean", mean, "std", std)
+        return _C_ops.mirror_normalize(x, mirror, "mean", mean,
+                                       "std", std)
 
     helper = LayerHelper("mirror_normalize", **locals())
     dtype = helper.input_dtype()
     out = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
         type="mirror_normalize",
-        inputs={"X": x,
-                "Mirror": mirror},
+        inputs={"X": x, "Mirror": mirror},
         outputs={"Out": out},
-        attrs={"mean": mean,
-               "std": std})
+        attrs={"mean": mean, "std": std})
     return out
 
 
@@ -1516,8 +1503,8 @@ def forward(self, x, boxes, boxes_num, aligned=True):
 
 def random_crop_and_resize(x,
                            size,
-                           aspect_ratio_min=3. / 4.,
-                           aspect_ratio_max=4. / 3.,
+                           aspect_ratio_min=3./4.,
+                           aspect_ratio_max=4./3.,
                            area_min=0.08,
                            area_max=1.,
                            num_attempts=10,
@@ -1582,9 +1569,10 @@ def random_crop_and_resize(x,
         out = _C_ops.batch_random_crop_and_resize(
             x, "size", size, "aspect_ratio_min", aspect_ratio_min,
             "aspect_ratio_max", aspect_ratio_max, "area_max", area_max,
-            "area_min", area_min, "num_attempts", num_attempts, "interp_method",
-            interp_method, "align_corners", align_corners, "align_mode",
-            align_mode, "data_format", data_format, "seed", seed)
+            "area_min", area_min, "num_attempts", num_attempts,
+            "interp_method", interp_method, "align_corners",
+            align_corners, "align_mode", align_mode,
+            "data_format", data_format, "seed", seed)
         return out
 
     helper = LayerHelper('batch_random_crop_and_resize', **locals())
@@ -1669,10 +1657,10 @@ def image_resize(x,
         size = (size, size)
 
     if in_dygraph_mode():
-        out = _C_ops.batch_resize(x, "size", size, "interp_method",
-                                  interp_method, "align_corners", align_corners,
-                                  "align_mode", align_mode, "data_format",
-                                  data_format, "seed", seed)
+        out = _C_ops.batch_resize(
+            x, "size", size, "interp_method", interp_method,
+            "align_corners", align_corners, "align_mode",
+            align_mode, "data_format", data_format, "seed", seed)
         return out
 
     helper = LayerHelper('batch_resize', **locals())
@@ -1688,7 +1676,10 @@ def image_resize(x,
         "seed": seed,
     }
     helper.append_op(
-        type="batch_resize", inputs=inputs, outputs={"Out": out}, attrs=attrs)
+        type="batch_resize",
+        inputs=inputs,
+        outputs={"Out": out},
+        attrs=attrs)
     return out
 
 
diff --git a/python/paddle/vision/reader.py b/python/paddle/vision/reader.py
index 7c8c22e2f676b0..4009b787890413 100644
--- a/python/paddle/vision/reader.py
+++ b/python/paddle/vision/reader.py
@@ -22,14 +22,15 @@
 from paddle.common_ops_import import *
 from paddle import _C_ops
 
-__all__ = [  #noqa
+__all__ = [ #noqa
     'file_label_loader',
     'file_label_reader',
 ]
 
 
 class _Sampler(object):
-    def __init__(self, batch_size, num_samples, shuffle=False, drop_last=False):
+    def __init__(self, batch_size, num_samples,
+                 shuffle=False, drop_last=False):
         self.batch_size = batch_size
         self.num_samples = num_samples
         self.shuffle = shuffle
@@ -48,7 +49,7 @@ def __next__(self):
         batch_len = min(self.batch_size, self.num_samples - self.start_idx)
         indices = self.sample_ids[self.start_idx:self.start_idx + batch_len]
         self.start_idx += batch_len
-
+        
         if self.drop_last and len(indices) < self.batch_size:
             self.reset()
             return self.__next__()
@@ -65,16 +66,13 @@ class _SamplerManager(object):
     def __init__(self):
         self.samplers = {}
 
-    def get(self,
-            sample_id,
-            batch_size,
-            num_samples,
-            shuffle=False,
-            drop_last=False):
+    def get(self, sample_id, batch_size, num_samples,
+            shuffle=False, drop_last=False):
         if sample_id in self.samplers:
             return self.samplers[sample_id]
 
-        sampler = _Sampler(batch_size, num_samples, shuffle, drop_last)
+        sampler = _Sampler(batch_size, num_samples,
+                           shuffle, drop_last)
         self.samplers[sample_id] = sampler
         return sampler
 
@@ -82,7 +80,7 @@ def get(self,
 _sampler_manager = _SamplerManager()
 
 
-def file_label_loader(data_root, indices, batch_size, name=None):
+def file_label_loader(data_root, indices, name=None):
     """
     Reads a batch of data, outputs the bytes contents of a file
     as a uint8 Tensor with one dimension.
@@ -96,25 +94,23 @@ def file_label_loader(data_root, indices, batch_size, name=None):
     """
 
     if in_dygraph_mode():
-        image = [
-            core.VarBase(core.VarDesc.VarType.UINT8, [],
-                         unique_name.generate("file_label_loader"),
-                         core.VarDesc.VarType.LOD_TENSOR, False)
-            for i in range(batch_size)
-        ]
-        return _C_ops.file_label_loader(indices, image, 'data_root', data_root)
+        image = core.VarBase(core.VarDesc.VarType.UINT8, [],
+                             unique_name.generate("file_label_loader"),
+                             core.VarDesc.VarType.LOD_TENSOR_ARRAY, False)
+        return _C_ops.file_label_loader(indices, image, 'data_root',
+                                        data_root)
 
     inputs = {"Indices": indices}
-    attrs = {'data_root': data_root, }
+    attrs = {
+        'data_root': data_root,
+    }
 
     helper = LayerHelper("file_label_loader", **locals())
-    image = [
-        helper.create_variable(
-            name=unique_name.generate("file_label_loader"),
-            type=core.VarDesc.VarType.LOD_TENSOR,
-            dtype='uint8') for i in range(batch_size)
-    ]
-
+    image = helper.create_variable(
+        name=unique_name.generate("file_label_loader"),
+        type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
+        dtype='uint8')
+    
     label = helper.create_variable(
         name=unique_name.generate("file_label_loader"),
         type=core.VarDesc.VarType.LOD_TENSOR,
@@ -163,25 +159,23 @@ def file_label_reader(file_root,
     targets = [s[1] for s in data_folder.samples]
 
     if in_dygraph_mode():
-        sample_id = utils._hash_with_id(file_root, batch_size, shuffle,
-                                        drop_last)
+        sample_id = utils._hash_with_id(file_root, batch_size,
+                                        shuffle, drop_last)
         sampler = _sampler_manager.get(sample_id,
                                        batch_size=batch_size,
                                        num_samples=len(samples),
                                        shuffle=shuffle,
                                        drop_last=drop_last)
         indices = paddle.to_tensor(next(sampler), dtype='int64')
-        outs = file_label_loader(file_root, indices, batch_size)
-        return outs[:-1], outs[-1]
+        return file_label_loader(file_root, indices)
 
     def _reader(indices):
-        return file_label_loader(file_root, indices, batch_size)
-
-    outs = paddle.io.data_reader(
-        _reader,
-        batch_size=batch_size,
-        num_samples=len(samples),
-        shuffle=shuffle,
-        drop_last=drop_last,
-        seed=seed)
-    return outs[:-1], outs[-1]
+        return file_label_loader(file_root, indices)
+
+    return paddle.io.data_reader(_reader,
+                                 batch_size=batch_size,
+                                 num_samples=len(samples),
+                                 shuffle=shuffle,
+                                 drop_last=drop_last,
+                                 seed=seed)
+