Commit 153bbbdf authored by indigo

feat(init project): init project

parent 6ef4c73c
*.weights
*.pth
*.onnx
*.engine
*.pyc
*.infer
*.npy
z_demo_*
__pycache__
.idea
.vscode
runs
log
*.jpg
*.json
data/outcome
# This should be run on JetPack 4.4 / JetPack 4.4 GA with DeepStream 5.0 / DeepStream 5.0 GA.
1. Compile the custom Yolo plugin (nvdsinfer_custom_impl_Yolo).
2. Convert the ONNX file to a TensorRT engine with trtexec / TensorRT (a sample command is shown below).
3. Change model-engine-file in config_infer_primary_yoloV4.txt to the generated engine.
4. In deepstream_app_config_yoloV4.txt, change
a) source0 : uri=file:<your file> to point at your input file.
b) primary-gie : model-engine-file=<your_onnx_engine>
# Note that for multi-batch inference the overhead is large because NMS is not used.
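# Example for step 2: a typical trtexec invocation on JetPack 4.4 (TensorRT 7) might look like the
# following; the ONNX and engine file names are placeholders, and the precision/workspace flags
# should be adjusted to your platform. On Jetson, trtexec ships under /usr/src/tensorrt/bin.
#   trtexec --onnx=<your_yolov4_onnx> --explicitBatch --saveEngine=<your_onnx_engine> --fp16 --workspace=2048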
################################################################################
# Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
################################################################################
# Following properties are mandatory when engine files are not specified:
# int8-calib-file(Only in INT8), model-file-format
# Caffemodel mandatory properties: model-file, proto-file, output-blob-names
# UFF: uff-file, input-dims, uff-input-blob-name, output-blob-names
# ONNX: onnx-file
#
# Mandatory properties for detectors:
# num-detected-classes
#
# Optional properties for detectors:
# cluster-mode(Default=Group Rectangles), interval(Primary mode only, Default=0)
# custom-lib-path
# parse-bbox-func-name
#
# Mandatory properties for classifiers:
# classifier-threshold, is-classifier
#
# Optional properties for classifiers:
# classifier-async-mode(Secondary mode only, Default=false)
#
# Optional properties in secondary mode:
# operate-on-gie-id(Default=0), operate-on-class-ids(Defaults to all classes),
# input-object-min-width, input-object-min-height, input-object-max-width,
# input-object-max-height
#
# Following properties are always recommended:
# batch-size(Default=1)
#
# Other optional properties:
# net-scale-factor(Default=1), network-mode(Default=0 i.e FP32),
# model-color-format(Default=0 i.e. RGB) model-engine-file, labelfile-path,
# mean-file, gie-unique-id(Default=0), offsets, process-mode (Default=1 i.e. primary),
# custom-lib-path, network-mode(Default=0 i.e FP32)
#
# The values in the config file are overridden by values set through GObject
# properties.
[property]
gpu-id=0
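# 1/255: scales 8-bit pixel values into the 0..1 range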
net-scale-factor=0.0039215697906911373
#0=RGB, 1=BGR
model-color-format=0
model-engine-file=<onnx_engine_file>
labelfile-path=labels.txt
## 0=FP32, 1=INT8, 2=FP16 mode
network-mode=2
num-detected-classes=80
gie-unique-id=1
network-type=0
is-classifier=0
## 0=Group Rectangles, 1=DBSCAN, 2=NMS, 3= DBSCAN+NMS Hybrid, 4 = None(No clustering)
cluster-mode=2
maintain-aspect-ratio=1
parse-bbox-func-name=NvDsInferParseCustomYoloV4
custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so
engine-create-func-name=NvDsInferYoloCudaEngineGet
#scaling-filter=0
#scaling-compute-hw=0
#output-blob-names=2012
[class-attrs-all]
nms-iou-threshold=0.2
pre-cluster-threshold=0.4
################################################################################
# Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
################################################################################
[application]
enable-perf-measurement=1
perf-measurement-interval-sec=5
#gie-kitti-output-dir=streamscl
[tiled-display]
enable=0
rows=1
columns=1
width=1280
height=720
gpu-id=0
#(0): nvbuf-mem-default - Default memory allocated, specific to particular platform
#(1): nvbuf-mem-cuda-pinned - Allocate Pinned/Host cuda memory, applicable for Tesla
#(2): nvbuf-mem-cuda-device - Allocate Device cuda memory, applicable for Tesla
#(3): nvbuf-mem-cuda-unified - Allocate Unified cuda memory, applicable for Tesla
#(4): nvbuf-mem-surface-array - Allocate Surface Array memory, applicable for Jetson
nvbuf-memory-type=0
[source0]
enable=1
#Type - 1=CameraV4L2 2=URI 3=MultiURI
type=3
uri=file:<Your_file_source>
num-sources=1
gpu-id=0
# (0): memtype_device - Memory type Device
# (1): memtype_pinned - Memory type Host Pinned
# (2): memtype_unified - Memory type Unified
cudadec-memtype=0
[sink0]
enable=1
#Type - 1=FakeSink 2=EglSink 3=File
type=2
sync=1
source-id=0
gpu-id=0
[osd]
enable=1
gpu-id=0
border-width=1
text-size=12
text-color=1;1;1;1;
text-bg-color=0.3;0.3;0.3;1
font=Serif
show-clock=0
clock-x-offset=800
clock-y-offset=820
clock-text-size=12
clock-color=1;0;0;0
nvbuf-memory-type=0
[streammux]
gpu-id=0
##Boolean property to inform muxer that sources are live
live-source=0
batch-size=1
##time out in usec, to wait after the first buffer is available
##to push the batch even if the complete batch is not formed
batched-push-timeout=40000
## Set muxer output width and height
width=1280
height=720
##Enable to maintain aspect ratio wrt source, and allow black borders, works
##along with width, height properties
enable-padding=0
nvbuf-memory-type=0
# config-file property is mandatory for any gie section.
# Other properties are optional and if set will override the properties set in
# the infer config file.
[primary-gie]
enable=1
gpu-id=0
model-engine-file=<onnx_engine_file>
labelfile-path=labels.txt
#batch-size=1
#Required by the app for OSD, not a plugin property
bbox-border-color0=1;0;0;1
bbox-border-color1=0;1;1;1
bbox-border-color2=0;0;1;1
bbox-border-color3=0;1;0;1
interval=0
gie-unique-id=1
nvbuf-memory-type=0
config-file=config_infer_primary_yoloV4.txt
[sink1]
enable=1
type=3
#1=mp4 2=mkv
container=1
#1=h264 2=h265 3=mpeg4
codec=1
#encoder type 0=Hardware 1=Software
enc-type=0
sync=0
bitrate=4000000
#H264 Profile - 0=Baseline 2=Main 4=High
#H265 Profile - 0=Main 1=Main10
profile=0
output-file=fp16_clip1_cam1.mp4
source-id=0
[tracker]
enable=1
# For the NvDCF tracker, tracker-width and tracker-height must each be a multiple of 32
tracker-width=608
tracker-height=608
#ll-lib-file=/opt/nvidia/deepstream/deepstream-5.0/lib/libnvds_mot_iou.so
#ll-lib-file=/opt/nvidia/deepstream/deepstream-5.0/lib/libnvds_nvdcf.so
ll-lib-file=/opt/nvidia/deepstream/deepstream-5.0/lib/libnvds_mot_klt.so
#ll-config-file required for IOU only
#ll-config-file=iou_config.txt
gpu-id=0
[tests]
file-loop=0
person
bicycle
car
motorcycle
airplane
bus
train
truck
boat
traffic light
fire hydrant
stop sign
parking meter
bench
bird
cat
dog
horse
sheep
cow
elephant
bear
zebra
giraffe
backpack
umbrella
handbag
tie
suitcase
frisbee
skis
snowboard
sports ball
kite
baseball bat
baseball glove
skateboard
surfboard
tennis racket
bottle
wine glass
cup
fork
knife
spoon
bowl
banana
apple
sandwich
orange
broccoli
carrot
hot dog
pizza
donut
cake
chair
couch
potted plant
bed
dining table
toilet
tv
laptop
mouse
remote
keyboard
cell phone
microwave
oven
toaster
sink
refrigerator
book
clock
vase
scissors
teddy bear
hair drier
toothbrush
################################################################################
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
################################################################################
CUDA_VER?=
ifeq ($(CUDA_VER),)
$(error "CUDA_VER is not set")
endif
CC:= g++
NVCC:=/usr/local/cuda-$(CUDA_VER)/bin/nvcc
CFLAGS:= -Wall -std=c++11 -shared -fPIC -Wno-error=deprecated-declarations
CFLAGS+= -I../../includes -I/usr/local/cuda-$(CUDA_VER)/include
LIBS:= -lnvinfer_plugin -lnvinfer -lnvparsers -L/usr/local/cuda-$(CUDA_VER)/lib64 -lcudart -lcublas -lstdc++fs
LFLAGS:= -shared -Wl,--start-group $(LIBS) -Wl,--end-group
INCS:= $(wildcard *.h)
SRCFILES:= nvdsinfer_yolo_engine.cpp \
nvdsparsebbox_Yolo.cpp \
yoloPlugins.cpp \
trt_utils.cpp \
yolo.cpp \
kernels.cu
TARGET_LIB:= libnvdsinfer_custom_impl_Yolo.so
TARGET_OBJS:= $(SRCFILES:.cpp=.o)
TARGET_OBJS:= $(TARGET_OBJS:.cu=.o)
all: $(TARGET_LIB)
%.o: %.cpp $(INCS) Makefile
$(CC) -c -o $@ $(CFLAGS) $<
%.o: %.cu $(INCS) Makefile
$(NVCC) -c -o $@ --compiler-options '-fPIC' $<
$(TARGET_LIB) : $(TARGET_OBJS)
$(CC) -o $@ $(TARGET_OBJS) $(LFLAGS)
clean:
rm -rf $(TARGET_LIB)
/*
* Copyright (c) 2018-2019 NVIDIA Corporation. All rights reserved.
*
* NVIDIA Corporation and its licensors retain all intellectual property
* and proprietary rights in and to this software, related documentation
* and any modifications thereto. Any use, reproduction, disclosure or
* distribution of this software and related documentation without an express
* license agreement from NVIDIA Corporation is strictly prohibited.
*
*/
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
inline __device__ float sigmoidGPU(const float& x) { return 1.0f / (1.0f + __expf(-x)); }
__global__ void gpuYoloLayerV3(const float* input, float* output, const uint gridSize, const uint numOutputClasses,
const uint numBBoxes)
{
uint x_id = blockIdx.x * blockDim.x + threadIdx.x;
uint y_id = blockIdx.y * blockDim.y + threadIdx.y;
uint z_id = blockIdx.z * blockDim.z + threadIdx.z;
if ((x_id >= gridSize) || (y_id >= gridSize) || (z_id >= numBBoxes))
{
return;
}
const int numGridCells = gridSize * gridSize;
const int bbindex = y_id * gridSize + x_id;
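// Each box occupies (5 + numOutputClasses) channels per grid cell, laid out as
// [x, y, w, h, objectness, class scores...]; x, y, objectness and the class scores
// are passed through a sigmoid while w and h are exponentiated.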
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)]
= sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)]);
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 1)]
= sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 1)]);
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 2)]
= __expf(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 2)]);
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)]
= __expf(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)]);
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]
= sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]);
for (uint i = 0; i < numOutputClasses; ++i)
{
output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + (5 + i))]
= sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + (5 + i))]);
}
}
cudaError_t cudaYoloLayerV3(const void* input, void* output, const uint& batchSize, const uint& gridSize,
const uint& numOutputClasses, const uint& numBBoxes,
uint64_t outputSize, cudaStream_t stream);
cudaError_t cudaYoloLayerV3(const void* input, void* output, const uint& batchSize, const uint& gridSize,
const uint& numOutputClasses, const uint& numBBoxes,
uint64_t outputSize, cudaStream_t stream)
{
dim3 threads_per_block(16, 16, 4);
dim3 number_of_blocks((gridSize / threads_per_block.x) + 1,
(gridSize / threads_per_block.y) + 1,
(numBBoxes / threads_per_block.z) + 1);
for (unsigned int batch = 0; batch < batchSize; ++batch)
{
gpuYoloLayerV3<<<number_of_blocks, threads_per_block, 0, stream>>>(
reinterpret_cast<const float*>(input) + (batch * outputSize),
reinterpret_cast<float*>(output) + (batch * outputSize), gridSize, numOutputClasses,
numBBoxes);
}
return cudaGetLastError();
}
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include "nvdsinfer_custom_impl.h"
#include "nvdsinfer_context.h"
#include "yoloPlugins.h"
#include "yolo.h"
#include <algorithm>
#define USE_CUDA_ENGINE_GET_API 1
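// When USE_CUDA_ENGINE_GET_API is set, the engine is built through NvDsInferYoloCudaEngineGet
// (referenced by engine-create-func-name in the infer config); otherwise a model parser is
// created via NvDsInferCreateModelParser.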
static bool getYoloNetworkInfo (NetworkInfo &networkInfo, const NvDsInferContextInitParams* initParams)
{
std::string yoloCfg = initParams->customNetworkConfigFilePath;
std::string yoloType;
std::transform (yoloCfg.begin(), yoloCfg.end(), yoloCfg.begin(), [] (uint8_t c) {
return std::tolower (c);});
if (yoloCfg.find("yolov2") != std::string::npos) {
if (yoloCfg.find("yolov2-tiny") != std::string::npos)
yoloType = "yolov2-tiny";
else
yoloType = "yolov2";
} else if (yoloCfg.find("yolov3") != std::string::npos) {
if (yoloCfg.find("yolov3-tiny") != std::string::npos)
yoloType = "yolov3-tiny";
else
yoloType = "yolov3";
} else {
std::cerr << "Yolo type is not defined from config file name:"
<< yoloCfg << std::endl;
return false;
}
networkInfo.networkType = yoloType;
networkInfo.configFilePath = initParams->customNetworkConfigFilePath;
networkInfo.wtsFilePath = initParams->modelFilePath;
networkInfo.deviceType = (initParams->useDLA ? "kDLA" : "kGPU");
networkInfo.inputBlobName = "data";
if (networkInfo.configFilePath.empty() ||
networkInfo.wtsFilePath.empty()) {
std::cerr << "Yolo config file or weights file is NOT specified."
<< std::endl;
return false;
}
if (!fileExists(networkInfo.configFilePath) ||
!fileExists(networkInfo.wtsFilePath)) {
std::cerr << "Yolo config file or weights file is NOT exist."
<< std::endl;
return false;
}
return true;
}
#if !USE_CUDA_ENGINE_GET_API
IModelParser* NvDsInferCreateModelParser(
const NvDsInferContextInitParams* initParams) {
NetworkInfo networkInfo;
if (!getYoloNetworkInfo(networkInfo, initParams)) {
return nullptr;
}
return new Yolo(networkInfo);
}
#else
extern "C"
bool NvDsInferYoloCudaEngineGet(nvinfer1::IBuilder * const builder,
const NvDsInferContextInitParams * const initParams,
nvinfer1::DataType dataType,
nvinfer1::ICudaEngine *& cudaEngine);
extern "C"
bool NvDsInferYoloCudaEngineGet(nvinfer1::IBuilder * const builder,
const NvDsInferContextInitParams * const initParams,
nvinfer1::DataType dataType,
nvinfer1::ICudaEngine *& cudaEngine)
{
NetworkInfo networkInfo;
if (!getYoloNetworkInfo(networkInfo, initParams)) {
return false;
}
Yolo yolo(networkInfo);
cudaEngine = yolo.createEngine (builder);
if (cudaEngine == nullptr)
{
std::cerr << "Failed to build cuda engine on "
<< networkInfo.configFilePath << std::endl;
return false;
}
return true;
}
#endif
/*
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstring>
#include <fstream>
#include <iostream>
#include <unordered_map>
#include "nvdsinfer_custom_impl.h"
#include "trt_utils.h"
static const int NUM_CLASSES_YOLO = 80;
extern "C" bool NvDsInferParseCustomYoloV3(
std::vector<NvDsInferLayerInfo> const& outputLayersInfo,
NvDsInferNetworkInfo const& networkInfo,
NvDsInferParseDetectionParams const& detectionParams,
std::vector<NvDsInferParseObjectInfo>& objectList);
extern "C" bool NvDsInferParseCustomYoloV3Tiny(
std::vector<NvDsInferLayerInfo> const& outputLayersInfo,
NvDsInferNetworkInfo const& networkInfo,
NvDsInferParseDetectionParams const& detectionParams,
std::vector<NvDsInferParseObjectInfo>& objectList);
extern "C" bool NvDsInferParseCustomYoloV2(
std::vector<NvDsInferLayerInfo> const& outputLayersInfo,
NvDsInferNetworkInfo const& networkInfo,
NvDsInferParseDetectionParams const& detectionParams,
std::vector<NvDsInferParseObjectInfo>& objectList);
extern "C" bool NvDsInferParseCustomYoloV2Tiny(
std::vector<NvDsInferLayerInfo> const& outputLayersInfo,
NvDsInferNetworkInfo const& networkInfo,
NvDsInferParseDetectionParams const& detectionParams,
std::vector<NvDsInferParseObjectInfo>& objectList);
extern "C" bool NvDsInferParseCustomYoloTLT(
std::vector<NvDsInferLayerInfo> const& outputLayersInfo,
NvDsInferNetworkInfo const& networkInfo,
NvDsInferParseDetectionParams const& detectionParams,
std::vector<NvDsInferParseObjectInfo>& objectList);
extern "C" bool NvDsInferParseCustomYoloV4(
std::vector<NvDsInferLayerInfo> const& outputLayersInfo,
NvDsInferNetworkInfo const& networkInfo,
NvDsInferParseDetectionParams const& detectionParams,
std::vector<NvDsInferParseObjectInfo>& objectList);
/* This is a sample bounding box parsing function for the sample YoloV3 detector model */
static NvDsInferParseObjectInfo convertBBox(const float& bx, const float& by, const float& bw,
const float& bh, const int& stride, const uint& netW,
const uint& netH)
{
NvDsInferParseObjectInfo b;
// Restore coordinates to network input resolution
float xCenter = bx * stride;
float yCenter = by * stride;
float x0 = xCenter - bw / 2;
float y0 = yCenter - bh / 2;
float x1 = x0 + bw;
float y1 = y0 + bh;
x0 = clamp(x0, 0, netW);
y0 = clamp(y0, 0, netH);
x1 = clamp(x1, 0, netW);
y1 = clamp(y1, 0, netH);
b.left = x0;
b.width = clamp(x1 - x0, 0, netW);
b.top = y0;
b.height = clamp(y1 - y0, 0, netH);
return b;
}
static void addBBoxProposal(const float bx, const float by, const float bw, const float bh,
const uint stride, const uint& netW, const uint& netH, const int maxIndex,
const float maxProb, std::vector<NvDsInferParseObjectInfo>& binfo)
{
NvDsInferParseObjectInfo bbi = convertBBox(bx, by, bw, bh, stride, netW, netH);
if (bbi.width < 1 || bbi.height < 1) return;
bbi.detectionConfidence = maxProb;
bbi.classId = maxIndex;
binfo.push_back(bbi);
}
static std::vector<NvDsInferParseObjectInfo>
decodeYoloV2Tensor(
const float* detections, const std::vector<float> &anchors,
const uint gridSizeW, const uint gridSizeH, const uint stride, const uint numBBoxes,
const uint numOutputClasses, const uint& netW,
const uint& netH)
{
std::vector<NvDsInferParseObjectInfo> binfo;
for (uint y = 0; y < gridSizeH; ++y) {
for (uint x = 0; x < gridSizeW; ++x) {
for (uint b = 0; b < numBBoxes; ++b)
{
const float pw = anchors[b * 2];
const float ph = anchors[b * 2 + 1];
const int numGridCells = gridSizeH * gridSizeW;
const int bbindex = y * gridSizeW + x;
const float bx
= x + detections[bbindex + numGridCells * (b * (5 + numOutputClasses) + 0)];
const float by
= y + detections[bbindex + numGridCells * (b * (5 + numOutputClasses) + 1)];
const float bw
= pw * exp (detections[bbindex + numGridCells * (b * (5 + numOutputClasses) + 2)]);
const float bh
= ph * exp (detections[bbindex + numGridCells * (b * (5 + numOutputClasses) + 3)]);
const float objectness
= detections[bbindex + numGridCells * (b * (5 + numOutputClasses) + 4)];
float maxProb = 0.0f;
int maxIndex = -1;
for (uint i = 0; i < numOutputClasses; ++i)
{
float prob
= (detections[bbindex
+ numGridCells * (b * (5 + numOutputClasses) + (5 + i))]);
if (prob > maxProb)
{
maxProb = prob;
maxIndex = i;
}
}
maxProb = objectness * maxProb;
addBBoxProposal(bx, by, bw, bh, stride, netW, netH, maxIndex, maxProb, binfo);
}
}
}
return binfo;
}
static std::vector<NvDsInferParseObjectInfo>
decodeYoloV3Tensor(
const float* detections, const std::vector<int> &mask, const std::vector<float> &anchors,
const uint gridSizeW, const uint gridSizeH, const uint stride, const uint numBBoxes,
const uint numOutputClasses, const uint& netW,
const uint& netH)
{
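// Note: unlike decodeYoloV2Tensor above, no sigmoid/exp is applied here; the YoloLayerV3
// CUDA plugin (kernels.cu) has already activated x, y, objectness and the class scores
// and exponentiated w and h before this tensor reaches the parser.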
std::vector<NvDsInferParseObjectInfo> binfo;
for (uint y = 0; y < gridSizeH; ++y) {
for (uint x = 0; x < gridSizeW; ++x) {
for (uint b = 0; b < numBBoxes; ++b)
{
const float pw = anchors[mask[b] * 2];
const float ph = anchors[mask[b] * 2 + 1];
const int numGridCells = gridSizeH * gridSizeW;
const int bbindex = y * gridSizeW + x;
const float bx
= x + detections[bbindex + numGridCells * (b * (5 + numOutputClasses) + 0)];
const float by
= y + detections[bbindex + numGridCells * (b * (5 + numOutputClasses) + 1)];
const float bw
= pw * detections[bbindex + numGridCells * (b * (5 + numOutputClasses) + 2)];
const float bh
= ph * detections[bbindex + numGridCells * (b * (5 + numOutputClasses) + 3)];
const float objectness
= detections[bbindex + numGridCells * (b * (5 + numOutputClasses) + 4)];
float maxProb = 0.0f;
int maxIndex = -1;
for (uint i = 0; i < numOutputClasses; ++i)
{
float prob
= (detections[bbindex
+ numGridCells * (b * (5 + numOutputClasses) + (5 + i))]);
if (prob > maxProb)
{
maxProb = prob;
maxIndex = i;
}
}
maxProb = objectness * maxProb;
addBBoxProposal(bx, by, bw, bh, stride, netW, netH, maxIndex, maxProb, binfo);
}
}
}
return binfo;
}
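/* Sort the output layers by grid height (inferDims.d[1]) so that each feature map is
 * matched with the corresponding per-scale anchor mask below. */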
static inline std::vector<const NvDsInferLayerInfo*>
SortLayers(const std::vector<NvDsInferLayerInfo> & outputLayersInfo)
{
std::vector<const NvDsInferLayerInfo*> outLayers;
for (auto const &layer : outputLayersInfo) {
outLayers.push_back (&layer);
}
std::sort(outLayers.begin(), outLayers.end(),
[](const NvDsInferLayerInfo* a, const NvDsInferLayerInfo* b) {
return a->inferDims.d[1] < b->inferDims.d[1];
});
return outLayers;
}
static bool NvDsInferParseYoloV3(
std::vector<NvDsInferLayerInfo> const& outputLayersInfo,
NvDsInferNetworkInfo const& networkInfo,
NvDsInferParseDetectionParams const& detectionParams,
std::vector<NvDsInferParseObjectInfo>& objectList,
const std::vector<float> &anchors,
const std::vector<std::vector<int>> &masks)
{
const uint kNUM_BBOXES = 3;
const std::vector<const NvDsInferLayerInfo*> sortedLayers =
SortLayers (outputLayersInfo);
if (sortedLayers.size() != masks.size()) {
std::cerr << "ERROR: yoloV3 output layer.size: " << sortedLayers.size()
<< " does not match mask.size: " << masks.size() << std::endl;
return false;
}
if (NUM_CLASSES_YOLO != detectionParams.numClassesConfigured)
{
std::cerr << "WARNING: Num classes mismatch. Configured:"
<< detectionParams.numClassesConfigured
<< ", detected by network: " << NUM_CLASSES_YOLO << std::endl;
}
std::vector<NvDsInferParseObjectInfo> objects;
for (uint idx = 0; idx < masks.size(); ++idx) {
const NvDsInferLayerInfo &layer = *sortedLayers[idx]; // 255 x Grid x Grid
assert(layer.inferDims.numDims == 3);
const uint gridSizeH = layer.inferDims.d[1];
const uint gridSizeW = layer.inferDims.d[2];
const uint stride = DIVUP(networkInfo.width, gridSizeW);
assert(stride == DIVUP(networkInfo.height, gridSizeH));
std::vector<NvDsInferParseObjectInfo> outObjs =
decodeYoloV3Tensor((const float*)(layer.buffer), masks[idx], anchors, gridSizeW, gridSizeH, stride, kNUM_BBOXES,
NUM_CLASSES_YOLO, networkInfo.width, networkInfo.height);
objects.insert(objects.end(), outObjs.begin(), outObjs.end());
}
objectList = objects;
return true;
}
static NvDsInferParseObjectInfo convertBBoxYoloV4(const float& bx1, const float& by1, const float& bx2,
const float& by2, const uint& netW, const uint& netH)
{
NvDsInferParseObjectInfo b;
// Restore coordinates to network input resolution
float x1 = bx1 * netW;
float y1 = by1 * netH;
float x2 = bx2 * netW;
float y2 = by2 * netH;
x1 = clamp(x1, 0, netW);
y1 = clamp(y1, 0, netH);
x2 = clamp(x2, 0, netW);
y2 = clamp(y2, 0, netH);
b.left = x1;
b.width = clamp(x2 - x1, 0, netW);
b.top = y1;
b.height = clamp(y2 - y1, 0, netH);
return b;
}
static void addBBoxProposalYoloV4(const float bx, const float by, const float bw, const float bh,
const uint& netW, const uint& netH, const int maxIndex,
const float maxProb, std::vector<NvDsInferParseObjectInfo>& binfo)
{
NvDsInferParseObjectInfo bbi = convertBBoxYoloV4(bx, by, bw, bh, netW, netH);
if (bbi.width < 1 || bbi.height < 1) return;
bbi.detectionConfidence = maxProb;
bbi.classId = maxIndex;
binfo.push_back(bbi);
}
static std::vector<NvDsInferParseObjectInfo>
decodeYoloV4Tensor(
const float* boxes, const float* scores,
const uint num_bboxes, NvDsInferParseDetectionParams const& detectionParams,
const uint& netW, const uint& netH)
{
std::vector<NvDsInferParseObjectInfo> binfo;
uint bbox_location = 0;
uint score_location = 0;
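// boxes packs [x1, y1, x2, y2] per detection (normalized coordinates, rescaled to the
// network input size in convertBBoxYoloV4); scores packs one probability per configured class.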
for (uint b = 0; b < num_bboxes; ++b)
{
float bx1 = boxes[bbox_location];
float by1 = boxes[bbox_location + 1];
float bx2 = boxes[bbox_location + 2];
float by2 = boxes[bbox_location + 3];
float maxProb = 0.0f;
int maxIndex = -1;
for (uint c = 0; c < detectionParams.numClassesConfigured; ++c)
{
float prob = scores[score_location + c];
if (prob > maxProb)
{
maxProb = prob;
maxIndex = c;
}
}
if (maxProb > detectionParams.perClassPreclusterThreshold[maxIndex])
{
addBBoxProposalYoloV4(bx1, by1, bx2, by2, netW, netH, maxIndex, maxProb, binfo);
}
bbox_location += 4;
score_location += detectionParams.numClassesConfigured;
}
return binfo;
}
/* C-linkage to prevent name-mangling */
static bool NvDsInferParseYoloV4(
std::vector<NvDsInferLayerInfo> const& outputLayersInfo,
NvDsInferNetworkInfo const& networkInfo,
NvDsInferParseDetectionParams const& detectionParams,
std::vector<NvDsInferParseObjectInfo>& objectList)
{
if (NUM_CLASSES_YOLO != detectionParams.numClassesConfigured)
{
std::cerr << "WARNING: Num classes mismatch. Configured:"
<< detectionParams.numClassesConfigured
<< ", detected by network: " << NUM_CLASSES_YOLO << std::endl;
}
std::vector<NvDsInferParseObjectInfo> objects;
const NvDsInferLayerInfo &boxes = outputLayersInfo[0]; // num_boxes x 4
const NvDsInferLayerInfo &scores = outputLayersInfo[1]; // num_boxes x num_classes
const NvDsInferLayerInfo &subbox = outputLayersInfo[2];
//* printf("%d\n", subbox.inferDims.numDims);
// 3 dimensional: [num_boxes, 1, 4]
assert(boxes.inferDims.numDims == 3);
// 2 dimensional: [num_boxes, num_classes]
assert(scores.inferDims.numDims == 2);
// The second dimension should be num_classes
assert(detectionParams.numClassesConfigured == scores.inferDims.d[1]);
uint num_bboxes = boxes.inferDims.d[0];
// std::cout << "Network Info: " << networkInfo.height << " " << networkInfo.width << std::endl;
std::vector<NvDsInferParseObjectInfo> outObjs =
decodeYoloV4Tensor(
(const float*)(boxes.buffer), (const float*)(scores.buffer), num_bboxes, detectionParams,
networkInfo.width, networkInfo.height);
objects.insert(objects.end(), outObjs.begin(), outObjs.end());
objectList = objects;
return true;
}
extern "C" bool NvDsInferParseCustomYoloV4(
std::vector<NvDsInferLayerInfo> const& outputLayersInfo,
NvDsInferNetworkInfo const& networkInfo,
NvDsInferParseDetectionParams const& detectionParams,
std::vector<NvDsInferParseObjectInfo>& objectList)
{
return NvDsInferParseYoloV4 (
outputLayersInfo, networkInfo, detectionParams, objectList);
}
extern "C" bool NvDsInferParseCustomYoloV3(
std::vector<NvDsInferLayerInfo> const& outputLayersInfo,
NvDsInferNetworkInfo const& networkInfo,
NvDsInferParseDetectionParams const& detectionParams,
std::vector<NvDsInferParseObjectInfo>& objectList)
{
static const std::vector<float> kANCHORS = {
10.0, 13.0, 16.0, 30.0, 33.0, 23.0, 30.0, 61.0, 62.0,
45.0, 59.0, 119.0, 116.0, 90.0, 156.0, 198.0, 373.0, 326.0};
static const std::vector<std::vector<int>> kMASKS = {
{6, 7, 8},
{3, 4, 5},
{0, 1, 2}};
return NvDsInferParseYoloV3 (
outputLayersInfo, networkInfo, detectionParams, objectList,
kANCHORS, kMASKS);
}
extern "C" bool NvDsInferParseCustomYoloV3Tiny(
std::vector<NvDsInferLayerInfo> const& outputLayersInfo,
NvDsInferNetworkInfo const& networkInfo,
NvDsInferParseDetectionParams const& detectionParams,
std::vector<NvDsInferParseObjectInfo>& objectList)
{
static const std::vector<float> kANCHORS = {
10, 14, 23, 27, 37, 58, 81, 82, 135, 169, 344, 319};
static const std::vector<std::vector<int>> kMASKS = {
{3, 4, 5},
//{0, 1, 2}}; // as per output result, select {1,2,3}
{1, 2, 3}};
return NvDsInferParseYoloV3 (
outputLayersInfo, networkInfo, detectionParams, objectList,
kANCHORS, kMASKS);
}
static bool NvDsInferParseYoloV2(
std::vector<NvDsInferLayerInfo> const& outputLayersInfo,
NvDsInferNetworkInfo const& networkInfo,
NvDsInferParseDetectionParams const& detectionParams,
std::vector<NvDsInferParseObjectInfo>& objectList)
{
// copy anchor data from yolov2.cfg file
std::vector<float> anchors = {0.57273, 0.677385, 1.87446, 2.06253, 3.33843,
5.47434, 7.88282, 3.52778, 9.77052, 9.16828};
const uint kNUM_BBOXES = 5;
if (outputLayersInfo.empty()) {
std::cerr << "Could not find output layer in bbox parsing" << std::endl;;
return false;
}
const NvDsInferLayerInfo &layer = outputLayersInfo[0];
if (NUM_CLASSES_YOLO != detectionParams.numClassesConfigured)
{
std::cerr << "WARNING: Num classes mismatch. Configured:"
<< detectionParams.numClassesConfigured
<< ", detected by network: " << NUM_CLASSES_YOLO << std::endl;
}
assert(layer.inferDims.numDims == 3);
const uint gridSizeH = layer.inferDims.d[1];
const uint gridSizeW = layer.inferDims.d[2];
const uint stride = DIVUP(networkInfo.width, gridSizeW);
assert(stride == DIVUP(networkInfo.height, gridSizeH));
for (auto& anchor : anchors) {
anchor *= stride;
}
std::vector<NvDsInferParseObjectInfo> objects =
decodeYoloV2Tensor((const float*)(layer.buffer), anchors, gridSizeW, gridSizeH, stride, kNUM_BBOXES,
NUM_CLASSES_YOLO, networkInfo.width, networkInfo.height);
objectList = objects;
return true;
}
extern "C" bool NvDsInferParseCustomYoloV2(
std::vector<NvDsInferLayerInfo> const& outputLayersInfo,
NvDsInferNetworkInfo const& networkInfo,
NvDsInferParseDetectionParams const& detectionParams,
std::vector<NvDsInferParseObjectInfo>& objectList)
{
return NvDsInferParseYoloV2 (
outputLayersInfo, networkInfo, detectionParams, objectList);
}
extern "C" bool NvDsInferParseCustomYoloV2Tiny(
std::vector<NvDsInferLayerInfo> const& outputLayersInfo,
NvDsInferNetworkInfo const& networkInfo,
NvDsInferParseDetectionParams const& detectionParams,
std::vector<NvDsInferParseObjectInfo>& objectList)
{
return NvDsInferParseYoloV2 (
outputLayersInfo, networkInfo, detectionParams, objectList);
}
extern "C" bool NvDsInferParseCustomYoloTLT(
std::vector<NvDsInferLayerInfo> const& outputLayersInfo,
NvDsInferNetworkInfo const& networkInfo,
NvDsInferParseDetectionParams const& detectionParams,
std::vector<NvDsInferParseObjectInfo>& objectList)
{
if(outputLayersInfo.size() != 4)
{
std::cerr << "Mismatch in the number of output buffers."
<< "Expected 4 output buffers, detected in the network :"
<< outputLayersInfo.size() << std::endl;
return false;
}
const int topK = 200;
const int* keepCount = static_cast <const int*>(outputLayersInfo.at(0).buffer);
const float* boxes = static_cast <const float*>(outputLayersInfo.at(1).buffer);
const float* scores = static_cast <const float*>(outputLayersInfo.at(2).buffer);
const float* cls = static_cast <const float*>(outputLayersInfo.at(3).buffer);
for (int i = 0; (i < keepCount[0]) && (objectList.size() <= topK); ++i)
{
const float* loc = &boxes[0] + (i * 4);
const float* conf = &scores[0] + i;
const float* cls_id = &cls[0] + i;
if(conf[0] > 1.001)
continue;
if((loc[0] < 0) || (loc[1] < 0) || (loc[2] < 0) || (loc[3] < 0))
continue;
if((loc[0] > networkInfo.width) || (loc[2] > networkInfo.width) || (loc[1] > networkInfo.height) || (loc[3] > networkInfo.height))
continue;
if((loc[2] < loc[0]) || (loc[3] < loc[1]))
continue;
if(((loc[3] - loc[1]) > networkInfo.height) || ((loc[2]-loc[0]) > networkInfo.width))
continue;
NvDsInferParseObjectInfo curObj{static_cast<unsigned int>(cls_id[0]),
loc[0],loc[1],(loc[2]-loc[0]),
(loc[3]-loc[1]), conf[0]};
objectList.push_back(curObj);
}
return true;
}
/* Check that the custom function has been defined correctly */
CHECK_CUSTOM_PARSE_FUNC_PROTOTYPE(NvDsInferParseCustomYoloV4);
CHECK_CUSTOM_PARSE_FUNC_PROTOTYPE(NvDsInferParseCustomYoloV3);
CHECK_CUSTOM_PARSE_FUNC_PROTOTYPE(NvDsInferParseCustomYoloV3Tiny);
CHECK_CUSTOM_PARSE_FUNC_PROTOTYPE(NvDsInferParseCustomYoloV2);
CHECK_CUSTOM_PARSE_FUNC_PROTOTYPE(NvDsInferParseCustomYoloV2Tiny);
CHECK_CUSTOM_PARSE_FUNC_PROTOTYPE(NvDsInferParseCustomYoloTLT);
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include "trt_utils.h"
#include <experimental/filesystem>
#include <fstream>
#include <iomanip>
#include <functional>
#include <algorithm>
#include <math.h>
#include "NvInferPlugin.h"
static void leftTrim(std::string& s)
{
s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](int ch) { return !isspace(ch); }));
}
static void rightTrim(std::string& s)
{
s.erase(std::find_if(s.rbegin(), s.rend(), [](int ch) { return !isspace(ch); }).base(), s.end());
}
std::string trim(std::string s)
{
leftTrim(s);
rightTrim(s);
return s;
}
float clamp(const float val, const float minVal, const float maxVal)
{
assert(minVal <= maxVal);
return std::min(maxVal, std::max(minVal, val));
}
bool fileExists(const std::string fileName, bool verbose)
{
if (!std::experimental::filesystem::exists(std::experimental::filesystem::path(fileName)))
{
if (verbose) std::cout << "File does not exist : " << fileName << std::endl;
return false;
}
return true;
}
std::vector<float> loadWeights(const std::string weightsFilePath, const std::string& networkType)
{
assert(fileExists(weightsFilePath));
std::cout << "Loading pre-trained weights..." << std::endl;
std::ifstream file(weightsFilePath, std::ios_base::binary);
assert(file.good());
std::string line;
if (networkType == "yolov2")
{
// Skip the 4 int32 header values (16 bytes) at the start of the stream
file.ignore(4 * 4);
}
else if ((networkType == "yolov3") || (networkType == "yolov3-tiny")
|| (networkType == "yolov2-tiny"))
{
// Skip the 5 int32 header values (20 bytes) at the start of the stream
file.ignore(4 * 5);
}
else
{
std::cout << "Invalid network type" << std::endl;
assert(0);
}
std::vector<float> weights;
char floatWeight[4];
while (!file.eof())
{
file.read(floatWeight, 4);
assert(file.gcount() == 4);
weights.push_back(*reinterpret_cast<float*>(floatWeight));
if (file.peek() == std::istream::traits_type::eof()) break;
}
std::cout << "Loading weights of " << networkType << " complete!"
<< std::endl;
std::cout << "Total Number of weights read : " << weights.size() << std::endl;
return weights;
}
std::string dimsToString(const nvinfer1::Dims d)
{
std::stringstream s;
assert(d.nbDims >= 1);
for (int i = 0; i < d.nbDims - 1; ++i)
{
s << std::setw(4) << d.d[i] << " x";
}
s << std::setw(4) << d.d[d.nbDims - 1];
return s.str();
}
void displayDimType(const nvinfer1::Dims d)
{
std::cout << "(" << d.nbDims << ") ";
for (int i = 0; i < d.nbDims; ++i)
{
switch (d.type[i])
{
case nvinfer1::DimensionType::kSPATIAL: std::cout << "kSPATIAL "; break;
case nvinfer1::DimensionType::kCHANNEL: std::cout << "kCHANNEL "; break;
case nvinfer1::DimensionType::kINDEX: std::cout << "kINDEX "; break;
case nvinfer1::DimensionType::kSEQUENCE: std::cout << "kSEQUENCE "; break;
}
}
std::cout << std::endl;
}
int getNumChannels(nvinfer1::ITensor* t)
{
nvinfer1::Dims d = t->getDimensions();
assert(d.nbDims == 3);
return d.d[0];
}
uint64_t get3DTensorVolume(nvinfer1::Dims inputDims)
{
assert(inputDims.nbDims == 3);
return inputDims.d[0] * inputDims.d[1] * inputDims.d[2];
}
nvinfer1::ILayer* netAddMaxpool(int layerIdx, std::map<std::string, std::string>& block,
nvinfer1::ITensor* input, nvinfer1::INetworkDefinition* network)
{
assert(block.at("type") == "maxpool");
assert(block.find("size") != block.end());
assert(block.find("stride") != block.end());
int size = std::stoi(block.at("size"));
int stride = std::stoi(block.at("stride"));
nvinfer1::IPoolingLayer* pool
= network->addPooling(*input, nvinfer1::PoolingType::kMAX, nvinfer1::DimsHW{size, size});
assert(pool);
std::string maxpoolLayerName = "maxpool_" + std::to_string(layerIdx);
pool->setStride(nvinfer1::DimsHW{stride, stride});
pool->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER);
pool->setName(maxpoolLayerName.c_str());
return pool;
}
nvinfer1::ILayer* netAddConvLinear(int layerIdx, std::map<std::string, std::string>& block,
std::vector<float>& weights,
std::vector<nvinfer1::Weights>& trtWeights, int& weightPtr,
int& inputChannels, nvinfer1::ITensor* input,
nvinfer1::INetworkDefinition* network)
{
assert(block.at("type") == "convolutional");
assert(block.find("batch_normalize") == block.end());
assert(block.at("activation") == "linear");
assert(block.find("filters") != block.end());
assert(block.find("pad") != block.end());
assert(block.find("size") != block.end());
assert(block.find("stride") != block.end());
int filters = std::stoi(block.at("filters"));
int padding = std::stoi(block.at("pad"));
int kernelSize = std::stoi(block.at("size"));
int stride = std::stoi(block.at("stride"));
int pad;
if (padding)
pad = (kernelSize - 1) / 2;
else
pad = 0;
// load the convolution layer bias
nvinfer1::Weights convBias{nvinfer1::DataType::kFLOAT, nullptr, filters};
float* val = new float[filters];
for (int i = 0; i < filters; ++i)
{
val[i] = weights[weightPtr];
weightPtr++;
}
convBias.values = val;
trtWeights.push_back(convBias);
// load the convolutional layer weights
int size = filters * inputChannels * kernelSize * kernelSize;
nvinfer1::Weights convWt{nvinfer1::DataType::kFLOAT, nullptr, size};
val = new float[size];
for (int i = 0; i < size; ++i)
{
val[i] = weights[weightPtr];
weightPtr++;
}
convWt.values = val;
trtWeights.push_back(convWt);
nvinfer1::IConvolutionLayer* conv = network->addConvolution(
*input, filters, nvinfer1::DimsHW{kernelSize, kernelSize}, convWt, convBias);
assert(conv != nullptr);
std::string convLayerName = "conv_" + std::to_string(layerIdx);
conv->setName(convLayerName.c_str());
conv->setStride(nvinfer1::DimsHW{stride, stride});
conv->setPadding(nvinfer1::DimsHW{pad, pad});
return conv;
}
nvinfer1::ILayer* netAddConvBNLeaky(int layerIdx, std::map<std::string, std::string>& block,
std::vector<float>& weights,
std::vector<nvinfer1::Weights>& trtWeights, int& weightPtr,
int& inputChannels, nvinfer1::ITensor* input,
nvinfer1::INetworkDefinition* network)
{
assert(block.at("type") == "convolutional");
assert(block.find("batch_normalize") != block.end());
assert(block.at("batch_normalize") == "1");
assert(block.at("activation") == "leaky");
assert(block.find("filters") != block.end());
assert(block.find("pad") != block.end());
assert(block.find("size") != block.end());
assert(block.find("stride") != block.end());
bool batchNormalize, bias;
if (block.find("batch_normalize") != block.end())
{
batchNormalize = (block.at("batch_normalize") == "1");
bias = false;
}
else
{
batchNormalize = false;
bias = true;
}
// all conv_bn_leaky layers assume bias is false
assert(batchNormalize == true && bias == false);
UNUSED(batchNormalize);
UNUSED(bias);
int filters = std::stoi(block.at("filters"));
int padding = std::stoi(block.at("pad"));
int kernelSize = std::stoi(block.at("size"));
int stride = std::stoi(block.at("stride"));
int pad;
if (padding)
pad = (kernelSize - 1) / 2;
else
pad = 0;
/***** CONVOLUTION LAYER *****/
/*****************************/
// batch norm weights are before the conv layer
// load BN biases (bn_biases)
std::vector<float> bnBiases;
for (int i = 0; i < filters; ++i)
{
bnBiases.push_back(weights[weightPtr]);
weightPtr++;
}
// load BN weights
std::vector<float> bnWeights;
for (int i = 0; i < filters; ++i)
{
bnWeights.push_back(weights[weightPtr]);
weightPtr++;
}
// load BN running_mean
std::vector<float> bnRunningMean;
for (int i = 0; i < filters; ++i)
{
bnRunningMean.push_back(weights[weightPtr]);
weightPtr++;
}
// load BN running_var
std::vector<float> bnRunningVar;
for (int i = 0; i < filters; ++i)
{
// 1e-05 for numerical stability
bnRunningVar.push_back(sqrt(weights[weightPtr] + 1.0e-5));
weightPtr++;
}
// load Conv layer weights (GKCRS)
int size = filters * inputChannels * kernelSize * kernelSize;
nvinfer1::Weights convWt{nvinfer1::DataType::kFLOAT, nullptr, size};
float* val = new float[size];
for (int i = 0; i < size; ++i)
{
val[i] = weights[weightPtr];
weightPtr++;
}
convWt.values = val;
trtWeights.push_back(convWt);
nvinfer1::Weights convBias{nvinfer1::DataType::kFLOAT, nullptr, 0};
trtWeights.push_back(convBias);
nvinfer1::IConvolutionLayer* conv = network->addConvolution(
*input, filters, nvinfer1::DimsHW{kernelSize, kernelSize}, convWt, convBias);
assert(conv != nullptr);
std::string convLayerName = "conv_" + std::to_string(layerIdx);
conv->setName(convLayerName.c_str());
conv->setStride(nvinfer1::DimsHW{stride, stride});
conv->setPadding(nvinfer1::DimsHW{pad, pad});
/***** BATCHNORM LAYER *****/
/***************************/
size = filters;
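// Fold batch norm into an affine scale layer: y = scale * x + shift, with
// scale = gamma / sqrt(var + eps) and shift = beta - mean * gamma / sqrt(var + eps)
// (bnRunningVar above already stores sqrt(var + eps)).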
// create the weights
nvinfer1::Weights shift{nvinfer1::DataType::kFLOAT, nullptr, size};
nvinfer1::Weights scale{nvinfer1::DataType::kFLOAT, nullptr, size};
nvinfer1::Weights power{nvinfer1::DataType::kFLOAT, nullptr, size};
float* shiftWt = new float[size];
for (int i = 0; i < size; ++i)
{
shiftWt[i]
= bnBiases.at(i) - ((bnRunningMean.at(i) * bnWeights.at(i)) / bnRunningVar.at(i));
}
shift.values = shiftWt;
float* scaleWt = new float[size];
for (int i = 0; i < size; ++i)
{
scaleWt[i] = bnWeights.at(i) / bnRunningVar[i];
}
scale.values = scaleWt;
float* powerWt = new float[size];
for (int i = 0; i < size; ++i)
{
powerWt[i] = 1.0;
}
power.values = powerWt;
trtWeights.push_back(shift);
trtWeights.push_back(scale);
trtWeights.push_back(power);
// Add the batch norm layers
nvinfer1::IScaleLayer* bn = network->addScale(
*conv->getOutput(0), nvinfer1::ScaleMode::kCHANNEL, shift, scale, power);
assert(bn != nullptr);
std::string bnLayerName = "batch_norm_" + std::to_string(layerIdx);
bn->setName(bnLayerName.c_str());
/***** ACTIVATION LAYER *****/
/****************************/
nvinfer1::ITensor* bnOutput = bn->getOutput(0);
nvinfer1::IActivationLayer* leaky = network->addActivation(
*bnOutput, nvinfer1::ActivationType::kLEAKY_RELU);
assert(leaky != nullptr);
leaky->setAlpha(0.1);
std::string leakyLayerName = "leaky_" + std::to_string(layerIdx);
leaky->setName(leakyLayerName.c_str());
return leaky;
}
nvinfer1::ILayer* netAddUpsample(int layerIdx, std::map<std::string, std::string>& block,
std::vector<float>& weights,
std::vector<nvinfer1::Weights>& trtWeights, int& inputChannels,
nvinfer1::ITensor* input, nvinfer1::INetworkDefinition* network)
{
assert(block.at("type") == "upsample");
nvinfer1::Dims inpDims = input->getDimensions();
assert(inpDims.nbDims == 3);
assert(inpDims.d[1] == inpDims.d[2]);
int h = inpDims.d[1];
int w = inpDims.d[2];
int stride = std::stoi(block.at("stride"));
// add pre multiply matrix as a constant
nvinfer1::Dims preDims{3,
{1, stride * h, w},
{nvinfer1::DimensionType::kCHANNEL, nvinfer1::DimensionType::kSPATIAL,
nvinfer1::DimensionType::kSPATIAL}};
int size = stride * h * w;
nvinfer1::Weights preMul{nvinfer1::DataType::kFLOAT, nullptr, size};
float* preWt = new float[size];
/* (2*h * w)
[ [1, 0, ..., 0],
[1, 0, ..., 0],
[0, 1, ..., 0],
[0, 1, ..., 0],
...,
...,
[0, 0, ..., 1],
[0, 0, ..., 1] ]
*/
for (int i = 0, idx = 0; i < h; ++i)
{
for (int s = 0; s < stride; ++s)
{
for (int j = 0; j < w; ++j, ++idx)
{
preWt[idx] = (i == j) ? 1.0 : 0.0;
}
}
}
preMul.values = preWt;
trtWeights.push_back(preMul);
nvinfer1::IConstantLayer* preM = network->addConstant(preDims, preMul);
assert(preM != nullptr);
std::string preLayerName = "preMul_" + std::to_string(layerIdx);
preM->setName(preLayerName.c_str());
// add post multiply matrix as a constant
nvinfer1::Dims postDims{3,
{1, h, stride * w},
{nvinfer1::DimensionType::kCHANNEL, nvinfer1::DimensionType::kSPATIAL,
nvinfer1::DimensionType::kSPATIAL}};
size = stride * h * w;
nvinfer1::Weights postMul{nvinfer1::DataType::kFLOAT, nullptr, size};
float* postWt = new float[size];
/* (h * 2*w)
[ [1, 1, 0, 0, ..., 0, 0],
[0, 0, 1, 1, ..., 0, 0],
...,
...,
[0, 0, 0, 0, ..., 1, 1] ]
*/
for (int i = 0, idx = 0; i < h; ++i)
{
for (int j = 0; j < stride * w; ++j, ++idx)
{
postWt[idx] = (j / stride == i) ? 1.0 : 0.0;
}
}
postMul.values = postWt;
trtWeights.push_back(postMul);
nvinfer1::IConstantLayer* post_m = network->addConstant(postDims, postMul);
assert(post_m != nullptr);
std::string postLayerName = "postMul_" + std::to_string(layerIdx);
post_m->setName(postLayerName.c_str());
// add matrix multiply layers for upsampling
nvinfer1::IMatrixMultiplyLayer* mm1
= network->addMatrixMultiply(*preM->getOutput(0), nvinfer1::MatrixOperation::kNONE, *input,
nvinfer1::MatrixOperation::kNONE);
assert(mm1 != nullptr);
std::string mm1LayerName = "mm1_" + std::to_string(layerIdx);
mm1->setName(mm1LayerName.c_str());
nvinfer1::IMatrixMultiplyLayer* mm2
= network->addMatrixMultiply(*mm1->getOutput(0), nvinfer1::MatrixOperation::kNONE,
*post_m->getOutput(0), nvinfer1::MatrixOperation::kNONE);
assert(mm2 != nullptr);
std::string mm2LayerName = "mm2_" + std::to_string(layerIdx);
mm2->setName(mm2LayerName.c_str());
return mm2;
}
void printLayerInfo(std::string layerIndex, std::string layerName, std::string layerInput,
std::string layerOutput, std::string weightPtr)
{
std::cout << std::setw(6) << std::left << layerIndex << std::setw(15) << std::left << layerName;
std::cout << std::setw(20) << std::left << layerInput << std::setw(20) << std::left
<< layerOutput;
std::cout << std::setw(6) << std::left << weightPtr << std::endl;
}
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef __TRT_UTILS_H__
#define __TRT_UTILS_H__
#include <set>
#include <map>
#include <string>
#include <vector>
#include <cassert>
#include <iostream>
#include <fstream>
#include "NvInfer.h"
#define UNUSED(expr) (void)(expr)
#define DIVUP(n, d) (((n) + (d) - 1) / (d))
std::string trim(std::string s);
float clamp(const float val, const float minVal, const float maxVal);
bool fileExists(const std::string fileName, bool verbose = true);
std::vector<float> loadWeights(const std::string weightsFilePath, const std::string& networkType);
std::string dimsToString(const nvinfer1::Dims d);
void displayDimType(const nvinfer1::Dims d);
int getNumChannels(nvinfer1::ITensor* t);
uint64_t get3DTensorVolume(nvinfer1::Dims inputDims);
// Helper functions to create yolo engine
nvinfer1::ILayer* netAddMaxpool(int layerIdx, std::map<std::string, std::string>& block,
nvinfer1::ITensor* input, nvinfer1::INetworkDefinition* network);
nvinfer1::ILayer* netAddConvLinear(int layerIdx, std::map<std::string, std::string>& block,
std::vector<float>& weights,
std::vector<nvinfer1::Weights>& trtWeights, int& weightPtr,
int& inputChannels, nvinfer1::ITensor* input,
nvinfer1::INetworkDefinition* network);
nvinfer1::ILayer* netAddConvBNLeaky(int layerIdx, std::map<std::string, std::string>& block,
std::vector<float>& weights,
std::vector<nvinfer1::Weights>& trtWeights, int& weightPtr,
int& inputChannels, nvinfer1::ITensor* input,
nvinfer1::INetworkDefinition* network);
nvinfer1::ILayer* netAddUpsample(int layerIdx, std::map<std::string, std::string>& block,
std::vector<float>& weights,
std::vector<nvinfer1::Weights>& trtWeights, int& inputChannels,
nvinfer1::ITensor* input, nvinfer1::INetworkDefinition* network);
void printLayerInfo(std::string layerIndex, std::string layerName, std::string layerInput,
std::string layerOutput, std::string weightPtr);
#endif
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include "yolo.h"
#include "yoloPlugins.h"
#include <fstream>
#include <iomanip>
#include <iterator>
Yolo::Yolo(const NetworkInfo& networkInfo)
: m_NetworkType(networkInfo.networkType), // yolov3
m_ConfigFilePath(networkInfo.configFilePath), // yolov3.cfg
m_WtsFilePath(networkInfo.wtsFilePath), // yolov3.weights
m_DeviceType(networkInfo.deviceType), // kDLA, kGPU
m_InputBlobName(networkInfo.inputBlobName), // data
m_InputH(0),
m_InputW(0),
m_InputC(0),
m_InputSize(0)
{}
Yolo::~Yolo()
{
destroyNetworkUtils();
}
nvinfer1::ICudaEngine *Yolo::createEngine (nvinfer1::IBuilder* builder)
{
assert (builder);
std::vector<float> weights = loadWeights(m_WtsFilePath, m_NetworkType);
std::vector<nvinfer1::Weights> trtWeights;
nvinfer1::INetworkDefinition *network = builder->createNetwork();
if (parseModel(*network) != NVDSINFER_SUCCESS) {
network->destroy();
return nullptr;
}
// Build the engine
std::cout << "Building the TensorRT Engine..." << std::endl;
nvinfer1::ICudaEngine * engine = builder->buildCudaEngine(*network);
if (engine) {
std::cout << "Building complete!" << std::endl;
} else {
std::cerr << "Building engine failed!" << std::endl;
}
// destroy
network->destroy();
return engine;
}
NvDsInferStatus Yolo::parseModel(nvinfer1::INetworkDefinition& network) {
destroyNetworkUtils();
m_ConfigBlocks = parseConfigFile(m_ConfigFilePath);
parseConfigBlocks();
std::vector<float> weights = loadWeights(m_WtsFilePath, m_NetworkType);
// build yolo network
std::cout << "Building Yolo network..." << std::endl;
NvDsInferStatus status = buildYoloNetwork(weights, network);
if (status == NVDSINFER_SUCCESS) {
std::cout << "Building yolo network complete!" << std::endl;
} else {
std::cerr << "Building yolo network failed!" << std::endl;
}
return status;
}
NvDsInferStatus Yolo::buildYoloNetwork(
std::vector<float>& weights, nvinfer1::INetworkDefinition& network) {
int weightPtr = 0;
int channels = m_InputC;
nvinfer1::ITensor* data =
network.addInput(m_InputBlobName.c_str(), nvinfer1::DataType::kFLOAT,
nvinfer1::DimsCHW{static_cast<int>(m_InputC),
static_cast<int>(m_InputH), static_cast<int>(m_InputW)});
assert(data != nullptr && data->getDimensions().nbDims > 0);
nvinfer1::ITensor* previous = data;
std::vector<nvinfer1::ITensor*> tensorOutputs;
uint outputTensorCount = 0;
// build the network using the network API
for (uint i = 0; i < m_ConfigBlocks.size(); ++i) {
// check if num. of channels is correct
assert(getNumChannels(previous) == channels);
std::string layerIndex = "(" + std::to_string(tensorOutputs.size()) + ")";
if (m_ConfigBlocks.at(i).at("type") == "net") {
printLayerInfo("", "layer", " inp_size", " out_size", "weightPtr");
} else if (m_ConfigBlocks.at(i).at("type") == "convolutional") {
std::string inputVol = dimsToString(previous->getDimensions());
nvinfer1::ILayer* out;
std::string layerType;
// check if batch_norm enabled
if (m_ConfigBlocks.at(i).find("batch_normalize") !=
m_ConfigBlocks.at(i).end()) {
out = netAddConvBNLeaky(i, m_ConfigBlocks.at(i), weights,
m_TrtWeights, weightPtr, channels, previous, &network);
layerType = "conv-bn-leaky";
}
else
{
out = netAddConvLinear(i, m_ConfigBlocks.at(i), weights,
m_TrtWeights, weightPtr, channels, previous, &network);
layerType = "conv-linear";
}
previous = out->getOutput(0);
assert(previous != nullptr);
channels = getNumChannels(previous);
std::string outputVol = dimsToString(previous->getDimensions());
tensorOutputs.push_back(out->getOutput(0));
printLayerInfo(layerIndex, layerType, inputVol, outputVol, std::to_string(weightPtr));
} else if (m_ConfigBlocks.at(i).at("type") == "shortcut") {
assert(m_ConfigBlocks.at(i).at("activation") == "linear");
assert(m_ConfigBlocks.at(i).find("from") !=
m_ConfigBlocks.at(i).end());
int from = stoi(m_ConfigBlocks.at(i).at("from"));
std::string inputVol = dimsToString(previous->getDimensions());
// check if indexes are correct
assert((i - 2 >= 0) && (i - 2 < tensorOutputs.size()));
assert((i + from - 1 >= 0) && (i + from - 1 < tensorOutputs.size()));
assert(i + from - 1 < i - 2);
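// Index mapping: m_ConfigBlocks[0] is the [net] block and pushes no tensor, so the
// tensor produced by config block i lives at tensorOutputs[i - 1]. Hence
// tensorOutputs[i - 2] is the immediately preceding layer and tensorOutputs[i + from - 1]
// is the layer referenced by the (negative) 'from' offset.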
nvinfer1::IElementWiseLayer* ew = network.addElementWise(
*tensorOutputs[i - 2], *tensorOutputs[i + from - 1],
nvinfer1::ElementWiseOperation::kSUM);
assert(ew != nullptr);
std::string ewLayerName = "shortcut_" + std::to_string(i);
ew->setName(ewLayerName.c_str());
previous = ew->getOutput(0);
assert(previous != nullptr);
std::string outputVol = dimsToString(previous->getDimensions());
tensorOutputs.push_back(ew->getOutput(0));
printLayerInfo(layerIndex, "skip", inputVol, outputVol, " -");
} else if (m_ConfigBlocks.at(i).at("type") == "yolo") {
nvinfer1::Dims prevTensorDims = previous->getDimensions();
assert(prevTensorDims.d[1] == prevTensorDims.d[2]);
TensorInfo& curYoloTensor = m_OutputTensors.at(outputTensorCount);
curYoloTensor.gridSize = prevTensorDims.d[1];
curYoloTensor.stride = m_InputW / curYoloTensor.gridSize;
m_OutputTensors.at(outputTensorCount).volume = curYoloTensor.gridSize
* curYoloTensor.gridSize
* (curYoloTensor.numBBoxes * (5 + curYoloTensor.numClasses));
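// Illustrative size: for a 13x13 grid with 3 boxes per cell and 80 classes,
// the blob holds 13 * 13 * 3 * (5 + 80) = 43,095 floats per batch item.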
std::string layerName = "yolo_" + std::to_string(i);
curYoloTensor.blobName = layerName;
nvinfer1::IPluginV2* yoloPlugin
= new YoloLayerV3(m_OutputTensors.at(outputTensorCount).numBBoxes,
m_OutputTensors.at(outputTensorCount).numClasses,
m_OutputTensors.at(outputTensorCount).gridSize);
assert(yoloPlugin != nullptr);
nvinfer1::IPluginV2Layer* yolo =
network.addPluginV2(&previous, 1, *yoloPlugin);
assert(yolo != nullptr);
yolo->setName(layerName.c_str());
std::string inputVol = dimsToString(previous->getDimensions());
previous = yolo->getOutput(0);
assert(previous != nullptr);
previous->setName(layerName.c_str());
std::string outputVol = dimsToString(previous->getDimensions());
network.markOutput(*previous);
channels = getNumChannels(previous);
tensorOutputs.push_back(yolo->getOutput(0));
printLayerInfo(layerIndex, "yolo", inputVol, outputVol, std::to_string(weightPtr));
++outputTensorCount;
} else if (m_ConfigBlocks.at(i).at("type") == "region") {
nvinfer1::Dims prevTensorDims = previous->getDimensions();
assert(prevTensorDims.d[1] == prevTensorDims.d[2]);
TensorInfo& curRegionTensor = m_OutputTensors.at(outputTensorCount);
curRegionTensor.gridSize = prevTensorDims.d[1];
curRegionTensor.stride = m_InputW / curRegionTensor.gridSize;
m_OutputTensors.at(outputTensorCount).volume = curRegionTensor.gridSize
* curRegionTensor.gridSize
* (curRegionTensor.numBBoxes * (5 + curRegionTensor.numClasses));
std::string layerName = "region_" + std::to_string(i);
curRegionTensor.blobName = layerName;
nvinfer1::plugin::RegionParameters RegionParameters{
static_cast<int>(curRegionTensor.numBBoxes), 4,
static_cast<int>(curRegionTensor.numClasses), nullptr};
std::string inputVol = dimsToString(previous->getDimensions());
nvinfer1::IPluginV2* regionPlugin
= createRegionPlugin(RegionParameters);
assert(regionPlugin != nullptr);
nvinfer1::IPluginV2Layer* region =
network.addPluginV2(&previous, 1, *regionPlugin);
assert(region != nullptr);
region->setName(layerName.c_str());
previous = region->getOutput(0);
assert(previous != nullptr);
previous->setName(layerName.c_str());
std::string outputVol = dimsToString(previous->getDimensions());
network.markOutput(*previous);
channels = getNumChannels(previous);
tensorOutputs.push_back(region->getOutput(0));
printLayerInfo(layerIndex, "region", inputVol, outputVol, std::to_string(weightPtr));
std::cout << "Anchors are being converted to network input resolution i.e. Anchors x "
<< curRegionTensor.stride << " (stride)" << std::endl;
for (auto& anchor : curRegionTensor.anchors) anchor *= curRegionTensor.stride;
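// Example: with a stride of 32, an anchor value of 1.08 (grid-cell units in the cfg)
// becomes 1.08 * 32 = 34.56 pixels at the network input resolution.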
++outputTensorCount;
} else if (m_ConfigBlocks.at(i).at("type") == "reorg") {
std::string inputVol = dimsToString(previous->getDimensions());
nvinfer1::IPluginV2* reorgPlugin = createReorgPlugin(2);
assert(reorgPlugin != nullptr);
nvinfer1::IPluginV2Layer* reorg =
network.addPluginV2(&previous, 1, *reorgPlugin);
assert(reorg != nullptr);
std::string layerName = "reorg_" + std::to_string(i);
reorg->setName(layerName.c_str());
previous = reorg->getOutput(0);
assert(previous != nullptr);
std::string outputVol = dimsToString(previous->getDimensions());
channels = getNumChannels(previous);
tensorOutputs.push_back(reorg->getOutput(0));
printLayerInfo(layerIndex, "reorg", inputVol, outputVol, std::to_string(weightPtr));
}
// route layers (single or concat)
else if (m_ConfigBlocks.at(i).at("type") == "route") {
std::string strLayers = m_ConfigBlocks.at(i).at("layers");
std::vector<int> idxLayers;
size_t lastPos = 0, pos = 0;
while ((pos = strLayers.find(',', lastPos)) != std::string::npos) {
int vL = std::stoi(trim(strLayers.substr(lastPos, pos - lastPos)));
idxLayers.push_back (vL);
lastPos = pos + 1;
}
if (lastPos < strLayers.length()) {
std::string lastV = trim(strLayers.substr(lastPos));
if (!lastV.empty()) {
idxLayers.push_back (std::stoi(lastV));
}
}
assert (!idxLayers.empty());
std::vector<nvinfer1::ITensor*> concatInputs;
for (int idxLayer : idxLayers) {
if (idxLayer < 0) {
idxLayer = tensorOutputs.size() + idxLayer;
}
assert (idxLayer >= 0 && idxLayer < (int)tensorOutputs.size());
concatInputs.push_back (tensorOutputs[idxLayer]);
}
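// Example: "layers = -1, 61" (as in the yolov3 config below) resolves to the
// previous layer's output plus the output of darknet layer 61, which are then
// concatenated along the channel axis.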
nvinfer1::IConcatenationLayer* concat =
network.addConcatenation(concatInputs.data(), concatInputs.size());
assert(concat != nullptr);
std::string concatLayerName = "route_" + std::to_string(i - 1);
concat->setName(concatLayerName.c_str());
// concatenate along the channel dimension
concat->setAxis(0);
previous = concat->getOutput(0);
assert(previous != nullptr);
std::string outputVol = dimsToString(previous->getDimensions());
// set the output volume depth
channels = getNumChannels(previous);
tensorOutputs.push_back(concat->getOutput(0));
printLayerInfo(layerIndex, "route", " -", outputVol,
std::to_string(weightPtr));
} else if (m_ConfigBlocks.at(i).at("type") == "upsample") {
std::string inputVol = dimsToString(previous->getDimensions());
nvinfer1::ILayer* out = netAddUpsample(i - 1, m_ConfigBlocks[i],
weights, m_TrtWeights, channels, previous, &network);
previous = out->getOutput(0);
std::string outputVol = dimsToString(previous->getDimensions());
tensorOutputs.push_back(out->getOutput(0));
printLayerInfo(layerIndex, "upsample", inputVol, outputVol, " -");
} else if (m_ConfigBlocks.at(i).at("type") == "maxpool") {
std::string inputVol = dimsToString(previous->getDimensions());
nvinfer1::ILayer* out =
netAddMaxpool(i, m_ConfigBlocks.at(i), previous, &network);
previous = out->getOutput(0);
assert(previous != nullptr);
std::string outputVol = dimsToString(previous->getDimensions());
tensorOutputs.push_back(out->getOutput(0));
printLayerInfo(layerIndex, "maxpool", inputVol, outputVol, std::to_string(weightPtr));
}
else
{
std::cout << "Unsupported layer type --> \""
<< m_ConfigBlocks.at(i).at("type") << "\"" << std::endl;
assert(0);
}
}
if ((int)weights.size() != weightPtr)
{
std::cout << "Number of unused weights left : " << weights.size() - weightPtr << std::endl;
assert(0);
}
std::cout << "Output yolo blob names :" << std::endl;
for (auto& tensor : m_OutputTensors) {
std::cout << tensor.blobName << std::endl;
}
int nbLayers = network.getNbLayers();
std::cout << "Total number of yolo layers: " << nbLayers << std::endl;
return NVDSINFER_SUCCESS;
}
std::vector<std::map<std::string, std::string>>
Yolo::parseConfigFile (const std::string cfgFilePath)
{
assert(fileExists(cfgFilePath));
std::ifstream file(cfgFilePath);
assert(file.good());
std::string line;
std::vector<std::map<std::string, std::string>> blocks;
std::map<std::string, std::string> block;
while (getline(file, line))
{
if (line.size() == 0) continue;
if (line.front() == '#') continue;
line = trim(line);
if (line.front() == '[')
{
if (block.size() > 0)
{
blocks.push_back(block);
block.clear();
}
std::string key = "type";
std::string value = trim(line.substr(1, line.size() - 2));
block.insert(std::pair<std::string, std::string>(key, value));
}
else
{
int cpos = line.find('=');
std::string key = trim(line.substr(0, cpos));
std::string value = trim(line.substr(cpos + 1));
block.insert(std::pair<std::string, std::string>(key, value));
}
}
blocks.push_back(block);
return blocks;
}
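// Example: a cfg section such as
//   [convolutional]
//   batch_normalize=1
//   filters=32
//   size=3
// is returned as one block:
//   { {"type", "convolutional"}, {"batch_normalize", "1"}, {"filters", "32"}, {"size", "3"} }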
void Yolo::parseConfigBlocks()
{
for (auto block : m_ConfigBlocks) {
if (block.at("type") == "net")
{
assert((block.find("height") != block.end())
&& "Missing 'height' param in network cfg");
assert((block.find("width") != block.end()) && "Missing 'width' param in network cfg");
assert((block.find("channels") != block.end())
&& "Missing 'channels' param in network cfg");
m_InputH = std::stoul(block.at("height"));
m_InputW = std::stoul(block.at("width"));
m_InputC = std::stoul(block.at("channels"));
assert(m_InputW == m_InputH);
m_InputSize = m_InputC * m_InputH * m_InputW;
}
else if ((block.at("type") == "region") || (block.at("type") == "yolo"))
{
assert((block.find("num") != block.end())
&& std::string("Missing 'num' param in " + block.at("type") + " layer").c_str());
assert((block.find("classes") != block.end())
&& std::string("Missing 'classes' param in " + block.at("type") + " layer")
.c_str());
assert((block.find("anchors") != block.end())
&& std::string("Missing 'anchors' param in " + block.at("type") + " layer")
.c_str());
TensorInfo outputTensor;
std::string anchorString = block.at("anchors");
while (!anchorString.empty())
{
int npos = anchorString.find_first_of(',');
if (npos != -1)
{
float anchor = std::stof(trim(anchorString.substr(0, npos)));
outputTensor.anchors.push_back(anchor);
anchorString.erase(0, npos + 1);
}
else
{
float anchor = std::stof(trim(anchorString));
outputTensor.anchors.push_back(anchor);
break;
}
}
if ((m_NetworkType == "yolov3") || (m_NetworkType == "yolov3-tiny"))
{
assert((block.find("mask") != block.end())
&& std::string("Missing 'mask' param in " + block.at("type") + " layer")
.c_str());
std::string maskString = block.at("mask");
while (!maskString.empty())
{
int npos = maskString.find_first_of(',');
if (npos != -1)
{
uint mask = std::stoul(trim(maskString.substr(0, npos)));
outputTensor.masks.push_back(mask);
maskString.erase(0, npos + 1);
}
else
{
uint mask = std::stoul(trim(maskString));
outputTensor.masks.push_back(mask);
break;
}
}
}
outputTensor.numBBoxes = outputTensor.masks.size() > 0
? outputTensor.masks.size()
: std::stoul(trim(block.at("num")));
outputTensor.numClasses = std::stoul(block.at("classes"));
m_OutputTensors.push_back(outputTensor);
}
}
}
void Yolo::destroyNetworkUtils() {
// deallocate the weights
for (uint i = 0; i < m_TrtWeights.size(); ++i) {
if (m_TrtWeights[i].count > 0)
free(const_cast<void*>(m_TrtWeights[i].values));
}
m_TrtWeights.clear();
}
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef _YOLO_H_
#define _YOLO_H_
#include <stdint.h>
#include <string>
#include <vector>
#include <memory>
#include "NvInfer.h"
#include "trt_utils.h"
#include "nvdsinfer_custom_impl.h"
/**
* Holds all the file paths required to build a network.
*/
struct NetworkInfo
{
std::string networkType;
std::string configFilePath;
std::string wtsFilePath;
std::string deviceType;
std::string inputBlobName;
};
/**
* Holds information about an output tensor of the yolo network.
*/
struct TensorInfo
{
std::string blobName;
uint stride{0};
uint gridSize{0};
uint numClasses{0};
uint numBBoxes{0};
uint64_t volume{0};
std::vector<uint> masks;
std::vector<float> anchors;
int bindingIndex{-1};
float* hostBuffer{nullptr};
};
class Yolo : public IModelParser {
public:
Yolo(const NetworkInfo& networkInfo);
~Yolo() override;
bool hasFullDimsSupported() const override { return false; }
const char* getModelName() const override {
return m_ConfigFilePath.empty() ? m_NetworkType.c_str()
: m_ConfigFilePath.c_str();
}
NvDsInferStatus parseModel(nvinfer1::INetworkDefinition& network) override;
nvinfer1::ICudaEngine *createEngine (nvinfer1::IBuilder* builder);
protected:
const std::string m_NetworkType;
const std::string m_ConfigFilePath;
const std::string m_WtsFilePath;
const std::string m_DeviceType;
const std::string m_InputBlobName;
std::vector<TensorInfo> m_OutputTensors;
std::vector<std::map<std::string, std::string>> m_ConfigBlocks;
uint m_InputH;
uint m_InputW;
uint m_InputC;
uint64_t m_InputSize;
// TRT specific members
std::vector<nvinfer1::Weights> m_TrtWeights;
private:
NvDsInferStatus buildYoloNetwork(
std::vector<float>& weights, nvinfer1::INetworkDefinition& network);
std::vector<std::map<std::string, std::string>> parseConfigFile(
const std::string cfgFilePath);
void parseConfigBlocks();
void destroyNetworkUtils();
};
#endif // _YOLO_H_
/*
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include "yoloPlugins.h"
#include "NvInferPlugin.h"
#include <cassert>
#include <iostream>
#include <memory>
namespace {
template <typename T>
void write(char*& buffer, const T& val)
{
*reinterpret_cast<T*>(buffer) = val;
buffer += sizeof(T);
}
template <typename T>
void read(const char*& buffer, T& val)
{
val = *reinterpret_cast<const T*>(buffer);
buffer += sizeof(T);
}
} //namespace
// Forward declaration of cuda kernels
cudaError_t cudaYoloLayerV3 (
const void* input, void* output, const uint& batchSize,
const uint& gridSize, const uint& numOutputClasses,
const uint& numBBoxes, uint64_t outputSize, cudaStream_t stream);
YoloLayerV3::YoloLayerV3 (const void* data, size_t length)
{
const char *d = static_cast<const char*>(data);
read(d, m_NumBoxes);
read(d, m_NumClasses);
read(d, m_GridSize);
read(d, m_OutputSize);
}
YoloLayerV3::YoloLayerV3 (
const uint& numBoxes, const uint& numClasses, const uint& gridSize) :
m_NumBoxes(numBoxes),
m_NumClasses(numClasses),
m_GridSize(gridSize)
{
assert(m_NumBoxes > 0);
assert(m_NumClasses > 0);
assert(m_GridSize > 0);
m_OutputSize = m_GridSize * m_GridSize * (m_NumBoxes * (4 + 1 + m_NumClasses));
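// e.g. a 19x19 grid with 3 boxes and 80 classes gives 19 * 19 * 3 * 85 = 92,055 floats per batch item.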
}
nvinfer1::Dims
YoloLayerV3::getOutputDimensions(
int index, const nvinfer1::Dims* inputs, int nbInputDims)
{
assert(index == 0);
assert(nbInputDims == 1);
return inputs[0];
}
bool YoloLayerV3::supportsFormat (
nvinfer1::DataType type, nvinfer1::PluginFormat format) const {
return (type == nvinfer1::DataType::kFLOAT &&
format == nvinfer1::PluginFormat::kNCHW);
}
void
YoloLayerV3::configureWithFormat (
const nvinfer1::Dims* inputDims, int nbInputs,
const nvinfer1::Dims* outputDims, int nbOutputs,
nvinfer1::DataType type, nvinfer1::PluginFormat format, int maxBatchSize)
{
assert(nbInputs == 1);
assert (format == nvinfer1::PluginFormat::kNCHW);
assert(inputDims != nullptr);
}
int YoloLayerV3::enqueue(
int batchSize, const void* const* inputs, void** outputs, void* workspace,
cudaStream_t stream)
{
CHECK(cudaYoloLayerV3(
inputs[0], outputs[0], batchSize, m_GridSize, m_NumClasses, m_NumBoxes,
m_OutputSize, stream));
return 0;
}
size_t YoloLayerV3::getSerializationSize() const
{
return sizeof(m_NumBoxes) + sizeof(m_NumClasses) + sizeof(m_GridSize) + sizeof(m_OutputSize);
}
void YoloLayerV3::serialize(void* buffer) const
{
char *d = static_cast<char*>(buffer);
write(d, m_NumBoxes);
write(d, m_NumClasses);
write(d, m_GridSize);
write(d, m_OutputSize);
}
nvinfer1::IPluginV2* YoloLayerV3::clone() const
{
return new YoloLayerV3 (m_NumBoxes, m_NumClasses, m_GridSize);
}
REGISTER_TENSORRT_PLUGIN(YoloLayerV3PluginCreator);
/*
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef __YOLO_PLUGINS__
#define __YOLO_PLUGINS__
#include <cassert>
#include <cstring>
#include <cuda_runtime_api.h>
#include <iostream>
#include <memory>
#include "NvInferPlugin.h"
#define CHECK(status) \
{ \
if (status != 0) \
{ \
std::cout << "Cuda failure: " << cudaGetErrorString(status) << " in file " << __FILE__ \
<< " at line " << __LINE__ << std::endl; \
abort(); \
} \
}
namespace
{
const char* YOLOV3LAYER_PLUGIN_VERSION {"1"};
const char* YOLOV3LAYER_PLUGIN_NAME {"YoloLayerV3_TRT"};
} // namespace
class YoloLayerV3 : public nvinfer1::IPluginV2
{
public:
YoloLayerV3 (const void* data, size_t length);
YoloLayerV3 (const uint& numBoxes, const uint& numClasses, const uint& gridSize);
const char* getPluginType () const override { return YOLOV3LAYER_PLUGIN_NAME; }
const char* getPluginVersion () const override { return YOLOV3LAYER_PLUGIN_VERSION; }
int getNbOutputs () const override { return 1; }
nvinfer1::Dims getOutputDimensions (
int index, const nvinfer1::Dims* inputs,
int nbInputDims) override;
bool supportsFormat (
nvinfer1::DataType type, nvinfer1::PluginFormat format) const override;
void configureWithFormat (
const nvinfer1::Dims* inputDims, int nbInputs,
const nvinfer1::Dims* outputDims, int nbOutputs,
nvinfer1::DataType type, nvinfer1::PluginFormat format, int maxBatchSize) override;
int initialize () override { return 0; }
void terminate () override {}
size_t getWorkspaceSize (int maxBatchSize) const override { return 0; }
int enqueue (
int batchSize, const void* const* inputs, void** outputs,
void* workspace, cudaStream_t stream) override;
size_t getSerializationSize() const override;
void serialize (void* buffer) const override;
void destroy () override { delete this; }
nvinfer1::IPluginV2* clone() const override;
void setPluginNamespace (const char* pluginNamespace) override {
m_Namespace = pluginNamespace;
}
virtual const char* getPluginNamespace () const override {
return m_Namespace.c_str();
}
private:
uint m_NumBoxes {0};
uint m_NumClasses {0};
uint m_GridSize {0};
uint64_t m_OutputSize {0};
std::string m_Namespace {""};
};
class YoloLayerV3PluginCreator : public nvinfer1::IPluginCreator
{
public:
YoloLayerV3PluginCreator () {}
~YoloLayerV3PluginCreator () {}
const char* getPluginName () const override { return YOLOV3LAYER_PLUGIN_NAME; }
const char* getPluginVersion () const override { return YOLOV3LAYER_PLUGIN_VERSION; }
const nvinfer1::PluginFieldCollection* getFieldNames() override {
std::cerr << "YoloLayerV3PluginCreator::getFieldNames is not implemented" << std::endl;
return nullptr;
}
nvinfer1::IPluginV2* createPlugin (
const char* name, const nvinfer1::PluginFieldCollection* fc) override
{
std::cerr << "YoloLayerV3PluginCreator::createPlugin is not implemented.\n";
return nullptr;
}
nvinfer1::IPluginV2* deserializePlugin (
const char* name, const void* serialData, size_t serialLength) override
{
std::cout << "Deserialize yoloLayerV3 plugin: " << name << std::endl;
return new YoloLayerV3(serialData, serialLength);
}
void setPluginNamespace(const char* libNamespace) override {
m_Namespace = libNamespace;
}
const char* getPluginNamespace() const override {
return m_Namespace.c_str();
}
private:
std::string m_Namespace {""};
};
#endif // __YOLO_PLUGINS__
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
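# Camera-side script: grabs webcam frames with OpenCV, streams the raw frames over
# one TCP socket, receives detection boxes back over a second socket, and displays
# the annotated frames together with a rough FPS readout (see the threads below).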
import _thread
import queue
import threading
import time
from socket import *
import cv2
import numpy as np
from tool.utils import load_class_names, plot_boxes_cv2
ip_add = '127.0.0.1'
server_port = 25000
connect_port = 25003
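# send_from / recv_into move raw numpy buffers over a socket, looping until every
# byte has been transferred (memoryview slicing handles partial sends/receives).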
def send_from(arr, dest):
view = memoryview(arr).cast('B')
while len(view):
nsent = dest.send(view)
view = view[nsent:]
def recv_into(arr, source):
view = memoryview(arr).cast('B')
while len(view):
nrecv = source.recv_into(view)
view = view[nrecv:]
c_2 = socket(AF_INET, SOCK_STREAM)
c_2.connect((ip_add, connect_port))
s = socket(AF_INET, SOCK_STREAM)
s.bind(('', server_port))
s.listen(3)
qsize = 1
boxQue = queue.Queue(qsize)
img_sent = queue.Queue(qsize * 20)
lock = threading.Lock()
# time.sleep(10)
fps = 0
fps_dis = 0
def recv_box():
lth = np.zeros(shape=(1,), dtype=np.int32)
while 1:
if boxQue.full():
# print('box is full')
time.sleep(0.1)
else:
recv_into(lth, c_2)
if lth[0] == 0:
lock.acquire()
boxQue.put([0])
lock.release()
continue
arr = np.zeros(shape=(1, lth[0], 7), dtype=np.float32)
recv_into(arr, c_2)
box = arr.tolist()
for i in range(lth[0]):
box[0][i][-1] = np.int64(box[0][i][-1])
lock.acquire()
boxQue.put(box)
lock.release()
# sum_flag = np.zeros(shape=(1,), dtype=np.int32)
# def recv_flag():
# global sum_flag
# recv_into(sum_flag, c)
# print('done')
def cam_send():
c, a = s.accept()
cap = cv2.VideoCapture(0)
flag = cap.isOpened()
print(flag)
# _thread.start_new_thread(recv_flag, ())
cnt_arr = np.zeros(shape=(1,), dtype=np.int32)
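    # Warm-up/sync loop: keep sending frames (plus the frame's pixel sum) until the
    # remote side has acknowledged at least 5 of them via cnt_arr.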
while 1:
_, img = cap.read()
send_from(img, c)
send_from(np.array([np.sum(img)]), c)
recv_into(cnt_arr, c)
if cnt_arr[0] >= 5:
break
while 1:
while img_sent.full():
# print('sent is full')
time.sleep(0.1)
_, img = cap.read()
# print(img)
send_from(img, c)
lock.acquire()
img_sent.put(img)
lock.release()
# print(np.sum(img))
if cv2.waitKey(1) & 0xFF == ord('q'):
break
cap.release()
def fps_update():
global fps, fps_dis
while 1:
time.sleep(10)
print(fps)
fps_dis = fps / 10
fps = 0
_thread.start_new_thread(recv_box, ())
_thread.start_new_thread(cam_send, ())
_thread.start_new_thread(fps_update, ())
# def get_box():
# while boxQue.empty():
# time.sleep(0.1)
# lock.acquire()
# box = boxQue.get()
# lock.release()
# return box
namesfile = 'data/coco.names'
class_names = load_class_names(namesfile)
while (1):
# get a frame
while img_sent.empty() or boxQue.empty():
# print('sent or box are empty')
time.sleep(0.1)
lock.acquire()
img = img_sent.get()
boxes = boxQue.get()
lock.release()
# print(np.sum(img))
# start = time.time()
if boxes[0] == 0:
pass
else:
img = plot_boxes_cv2(img, boxes[0], 'predictions.jpg', class_names)
img = cv2.putText(img, 'FPS: {}'.format(fps_dis), (100, 100), cv2.FONT_HERSHEY_SIMPLEX, 2, (0, 0, 0), 2)
# end = time.time()
# print('time: ', end - start)
cv2.imshow('fps:', img)
fps += 1
# send_from(frame, c_3)
if cv2.waitKey(1) & 0xFF == ord('q'):
break
cv2.destroyAllWindows()
#c.close()
s.close()
[net]
# Testing
batch=1
subdivisions=1
# Training
# batch=64
# subdivisions=2
width=416
height=416
channels=3
momentum=0.9
decay=0.0005
angle=0
saturation = 1.5
exposure = 1.5
hue=.1
learning_rate=0.001
burn_in=1000
max_batches = 500200
policy=steps
steps=400000,450000
scales=.1,.1
# 0
[convolutional]
batch_normalize=1
filters=16
size=3
stride=1
pad=1
activation=leaky
# 1
[maxpool]
size=2
stride=2
# 2
[convolutional]
batch_normalize=1
filters=32
size=3
stride=1
pad=1
activation=leaky
# 3
[maxpool]
size=2
stride=2
# 4
[convolutional]
batch_normalize=1
filters=64
size=3
stride=1
pad=1
activation=leaky
# 5
[maxpool]
size=2
stride=2
# 6
[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=leaky
# 7
[maxpool]
size=2
stride=2
# 8
[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky
# 9
[maxpool]
size=2
stride=2
# 10
[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky
# 11
[maxpool]
size=2
stride=1
# 12
[convolutional]
batch_normalize=1
filters=1024
size=3
stride=1
pad=1
activation=leaky
###########
# 13
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
# 14
[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky
# 15
[convolutional]
size=1
stride=1
pad=1
filters=255
activation=linear
# 16
[yolo]
mask = 3,4,5
anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319
classes=80
num=6
jitter=.3
ignore_thresh = .7
truth_thresh = 1
random=1
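# With num=6 anchor pairs and mask=3,4,5, this head predicts 3 boxes per cell using
# the three largest anchors (81,82  135,169  344,319).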
# 17
[route]
layers = -4
# 18
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky
# 19
[upsample]
stride=2
# 20
[route]
layers = -1, 8
# 21
[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky
# 22
[convolutional]
size=1
stride=1
pad=1
filters=255
activation=linear
# 23
[yolo]
mask = 1,2,3
anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319
classes=80
num=6
jitter=.3
ignore_thresh = .7
truth_thresh = 1
random=1
[net]
# Testing
batch=1
subdivisions=1
# Training
# batch=64
# subdivisions=16
width=416
height=416
channels=3
momentum=0.9
decay=0.0005
angle=0
saturation = 1.5
exposure = 1.5
hue=.1
learning_rate=0.001
burn_in=1000
max_batches = 500200
policy=steps
steps=400000,450000
scales=.1,.1
[convolutional]
batch_normalize=1
filters=32
size=3
stride=1
pad=1
activation=leaky
# Downsample
[convolutional]
batch_normalize=1
filters=64
size=3
stride=2
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=32
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=64
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
# Downsample
[convolutional]
batch_normalize=1
filters=128
size=3
stride=2
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=64
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=64
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
# Downsample
[convolutional]
batch_normalize=1
filters=256
size=3
stride=2
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
# Downsample
[convolutional]
batch_normalize=1
filters=512
size=3
stride=2
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
# Downsample
[convolutional]
batch_normalize=1
filters=1024
size=3
stride=2
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=1024
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=1024
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=1024
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=1024
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
######################
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=1024
activation=leaky
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=1024
activation=leaky
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=1024
activation=leaky
[convolutional]
size=1
stride=1
pad=1
filters=255
activation=linear
[yolo]
mask = 6,7,8
anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
classes=80
num=9
jitter=.3
ignore_thresh = .5
truth_thresh = 1
random=1
[route]
layers = -4
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[upsample]
stride=2
[route]
layers = -1, 61
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=512
activation=leaky
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=512
activation=leaky
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=512
activation=leaky
[convolutional]
size=1
stride=1
pad=1
filters=255
activation=linear
[yolo]
mask = 3,4,5
anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
classes=80
num=9
jitter=.3
ignore_thresh = .5
truth_thresh = 1
random=1
[route]
layers = -4
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky
[upsample]
stride=2
[route]
layers = -1, 36
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=256
activation=leaky
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=256
activation=leaky
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=256
activation=leaky
[convolutional]
size=1
stride=1
pad=1
filters=255
activation=linear
[yolo]
mask = 0,1,2
anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
classes=80
num=9
jitter=.3
ignore_thresh = .5
truth_thresh = 1
random=1
[net]
# Testing
#batch=1
#subdivisions=1
# Training
batch=64
subdivisions=16
width=608
height=608
channels=3
momentum=0.949
decay=0.0005
angle=0
saturation = 1.5
exposure = 1.5
hue=.1
learning_rate=0.001
burn_in=1000
max_batches = 500500
policy=steps
steps=400000,450000
scales=.1,.1
#cutmix=1
mosaic=1
#:104x104 54:52x52 85:26x26 104:13x13 for 416
[convolutional]
batch_normalize=1
filters=32
size=3
stride=1
pad=1
activation=mish
# Downsample
[convolutional]
batch_normalize=1
filters=64
size=3
stride=2
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=64
size=1
stride=1
pad=1
activation=mish
[route]
layers = -2
[convolutional]
batch_normalize=1
filters=64
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=32
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=64
size=3
stride=1
pad=1
activation=mish
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=64
size=1
stride=1
pad=1
activation=mish
[route]
layers = -1,-7
[convolutional]
batch_normalize=1
filters=64
size=1
stride=1
pad=1
activation=mish
# Downsample
[convolutional]
batch_normalize=1
filters=128
size=3
stride=2
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=64
size=1
stride=1
pad=1
activation=mish
[route]
layers = -2
[convolutional]
batch_normalize=1
filters=64
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=64
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=64
size=3
stride=1
pad=1
activation=mish
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=64
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=64
size=3
stride=1
pad=1
activation=mish
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=64
size=1
stride=1
pad=1
activation=mish
[route]
layers = -1,-10
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=mish
# Downsample
[convolutional]
batch_normalize=1
filters=256
size=3
stride=2
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=mish
[route]
layers = -2
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=mish
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=mish
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=mish
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=mish
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=mish
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=mish
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=mish
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=mish
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=mish
[route]
layers = -1,-28
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=mish
# Downsample
[convolutional]
batch_normalize=1
filters=512
size=3
stride=2
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=mish
[route]
layers = -2
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=mish
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=mish
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=mish
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=mish
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=mish
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=mish
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=mish
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=mish
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=mish
[route]
layers = -1,-28
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=mish
# Downsample
[convolutional]
batch_normalize=1
filters=1024
size=3
stride=2
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=mish
[route]
layers = -2
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=mish
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=mish
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=mish
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=mish
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=mish
[route]
layers = -1,-16
[convolutional]
batch_normalize=1
filters=1024
size=1
stride=1
pad=1
activation=mish
##########################
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=1024
activation=leaky
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky
### SPP ###
[maxpool]
stride=1
size=5
[route]
layers=-2
[maxpool]
stride=1
size=9
[route]
layers=-4
[maxpool]
stride=1
size=13
[route]
layers=-1,-3,-5,-6
### End SPP ###
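# The SPP block above pools the same 512-channel map with 5x5, 9x9 and 13x13 max-pool
# windows (stride 1), and the route concatenates those three outputs with the pre-SPP
# convolution, giving a 2048-channel feature map.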
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=1024
activation=leaky
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[upsample]
stride=2
[route]
layers = 85
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[route]
layers = -1, -3
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=512
activation=leaky
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=512
activation=leaky
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky
[upsample]
stride=2
[route]
layers = 54
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky
[route]
layers = -1, -3
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=256
activation=leaky
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=256
activation=leaky
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky
##########################
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=256
activation=leaky
[convolutional]
size=1
stride=1
pad=1
filters=255
activation=linear
[yolo]
mask = 0,1,2
anchors = 12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401
classes=80
num=9
jitter=.3
ignore_thresh = .7
truth_thresh = 1
scale_x_y = 1.2
iou_thresh=0.213
cls_normalizer=1.0
iou_normalizer=0.07
iou_loss=ciou
nms_kind=greedynms
beta_nms=0.6
[route]
layers = -4
[convolutional]
batch_normalize=1
size=3
stride=2
pad=1
filters=256
activation=leaky
[route]
layers = -1, -16
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=512
activation=leaky
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=512
activation=leaky
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=512
activation=leaky
[convolutional]
size=1
stride=1
pad=1
filters=255
activation=linear
[yolo]
mask = 3,4,5
anchors = 12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401
classes=80
num=9
jitter=.3
ignore_thresh = .7
truth_thresh = 1
scale_x_y = 1.1
iou_thresh=0.213
cls_normalizer=1.0
iou_normalizer=0.07
iou_loss=ciou
nms_kind=greedynms
beta_nms=0.6
[route]
layers = -4
[convolutional]
batch_normalize=1
size=3
stride=2
pad=1
filters=512
activation=leaky
[route]
layers = -1, -37
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=1024
activation=leaky
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=1024
activation=leaky
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=1024
activation=leaky
[convolutional]
size=1
stride=1
pad=1
filters=255
activation=linear
[yolo]
mask = 6,7,8
anchors = 12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401
classes=80
num=9
jitter=.3
ignore_thresh = .7
truth_thresh = 1
random=1
scale_x_y = 1.05
iou_thresh=0.213
cls_normalizer=1.0
iou_normalizer=0.07
iou_loss=ciou
nms_kind=greedynms
beta_nms=0.6
[net]
# Testing
#batch=1
#subdivisions=1
# Training
batch=64
subdivisions=1
width=416
height=416
channels=3
momentum=0.9
decay=0.0005
angle=0
saturation = 1.5
exposure = 1.5
hue=.1
learning_rate=0.00261
burn_in=1000
max_batches = 500200
policy=steps
steps=400000,450000
scales=.1,.1
[convolutional]
batch_normalize=1
filters=32
size=3
stride=2
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=64
size=3
stride=2
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=64
size=3
stride=1
pad=1
activation=leaky
[route]
layers=-1
groups=2
group_id=1
[convolutional]
batch_normalize=1
filters=32
size=3
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=32
size=3
stride=1
pad=1
activation=leaky
[route]
layers = -1,-2
[convolutional]
batch_normalize=1
filters=64
size=1
stride=1
pad=1
activation=leaky
[route]
layers = -6,-1
[maxpool]
size=2
stride=2
[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=leaky
[route]
layers=-1
groups=2
group_id=1
[convolutional]
batch_normalize=1
filters=64
size=3
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=64
size=3
stride=1
pad=1
activation=leaky
[route]
layers = -1,-2
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky
[route]
layers = -6,-1
[maxpool]
size=2
stride=2
[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky
[route]
layers=-1
groups=2
group_id=1
[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=leaky
[route]
layers = -1,-2
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[route]
layers = -6,-1
[maxpool]
size=2
stride=2
[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky
##################################
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky
[convolutional]
size=1
stride=1
pad=1
filters=255
activation=linear
[yolo]
mask = 3,4,5
anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319
classes=80
num=6
jitter=.3
scale_x_y = 1.05
cls_normalizer=1.0
iou_normalizer=0.07
iou_loss=ciou
ignore_thresh = .7
truth_thresh = 1
random=0
resize=1.5
nms_kind=greedynms
beta_nms=0.6
[route]
layers = -4
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky
[upsample]
stride=2
[route]
layers = -1, 23
[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky
[convolutional]
size=1
stride=1
pad=1
filters=255
activation=linear
[yolo]
mask = 1,2,3
anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319
classes=80
num=6
jitter=.3
scale_x_y = 1.05
cls_normalizer=1.0
iou_normalizer=0.07
iou_loss=ciou
ignore_thresh = .7
truth_thresh = 1
random=0
resize=1.5
nms_kind=greedynms
beta_nms=0.6
[net]
batch=64
subdivisions=8
# Training
#width=512
#height=512
width=608
height=608
channels=3
momentum=0.949
decay=0.0005
angle=0
saturation = 1.5
exposure = 1.5
hue=.1
learning_rate=0.0013
burn_in=1000
max_batches = 500500
policy=steps
steps=400000,450000
scales=.1,.1
#cutmix=1
mosaic=1
#:104x104 54:52x52 85:26x26 104:13x13 for 416
[convolutional]
batch_normalize=1
filters=32
size=3
stride=1
pad=1
activation=mish
# Downsample
[convolutional]
batch_normalize=1
filters=64
size=3
stride=2
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=64
size=1
stride=1
pad=1
activation=mish
[route]
layers = -2
[convolutional]
batch_normalize=1
filters=64
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=32
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=64
size=3
stride=1
pad=1
activation=mish
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=64
size=1
stride=1
pad=1
activation=mish
[route]
layers = -1,-7
[convolutional]
batch_normalize=1
filters=64
size=1
stride=1
pad=1
activation=mish
# Downsample
[convolutional]
batch_normalize=1
filters=128
size=3
stride=2
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=64
size=1
stride=1
pad=1
activation=mish
[route]
layers = -2
[convolutional]
batch_normalize=1
filters=64
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=64
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=64
size=3
stride=1
pad=1
activation=mish
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=64
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=64
size=3
stride=1
pad=1
activation=mish
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=64
size=1
stride=1
pad=1
activation=mish
[route]
layers = -1,-10
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=mish
# Downsample
[convolutional]
batch_normalize=1
filters=256
size=3
stride=2
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=mish
[route]
layers = -2
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=mish
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=mish
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=mish
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=mish
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=mish
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=mish
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=mish
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=mish
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=mish
[route]
layers = -1,-28
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=mish
# Downsample
[convolutional]
batch_normalize=1
filters=512
size=3
stride=2
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=mish
[route]
layers = -2
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=mish
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=mish
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=mish
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=mish
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=mish
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=mish
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=mish
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=mish
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=mish
[route]
layers = -1,-28
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=mish
# Downsample
[convolutional]
batch_normalize=1
filters=1024
size=3
stride=2
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=mish
[route]
layers = -2
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=mish
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=mish
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=mish
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=mish
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=mish
[route]
layers = -1,-16
[convolutional]
batch_normalize=1
filters=1024
size=1
stride=1
pad=1
activation=mish
##########################
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=1024
activation=leaky
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky
### SPP ###
[maxpool]
stride=1
size=5
[route]
layers=-2
[maxpool]
stride=1
size=9
[route]
layers=-4
[maxpool]
stride=1
size=13
[route]
layers=-1,-3,-5,-6
### End SPP ###
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=1024
activation=leaky
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[upsample]
stride=2
[route]
layers = 85
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[route]
layers = -1, -3
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=512
activation=leaky
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=512
activation=leaky
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky
[upsample]
stride=2
[route]
layers = 54
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky
[route]
layers = -1, -3
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=256
activation=leaky
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=256
activation=leaky
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky
##########################
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=256
activation=leaky
[convolutional]
size=1
stride=1
pad=1
filters=255
activation=linear
[yolo]
mask = 0,1,2
anchors = 12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401
classes=80
num=9
jitter=.3
ignore_thresh = .7
truth_thresh = 1
scale_x_y = 1.2
iou_thresh=0.213
cls_normalizer=1.0
iou_normalizer=0.07
iou_loss=ciou
nms_kind=greedynms
beta_nms=0.6
max_delta=5
[route]
layers = -4
[convolutional]
batch_normalize=1
size=3
stride=2
pad=1
filters=256
activation=leaky
[route]
layers = -1, -16
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=512
activation=leaky
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=512
activation=leaky
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=512
activation=leaky
[convolutional]
size=1
stride=1
pad=1
filters=255
activation=linear
[yolo]
mask = 3,4,5
anchors = 12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401
classes=80
num=9
jitter=.3
ignore_thresh = .7
truth_thresh = 1
scale_x_y = 1.1
iou_thresh=0.213
cls_normalizer=1.0
iou_normalizer=0.07
iou_loss=ciou
nms_kind=greedynms
beta_nms=0.6
max_delta=5
[route]
layers = -4
[convolutional]
batch_normalize=1
size=3
stride=2
pad=1
filters=512
activation=leaky
[route]
layers = -1, -37
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=1024
activation=leaky
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=1024
activation=leaky
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=1024
activation=leaky
[convolutional]
size=1
stride=1
pad=1
filters=255
activation=linear
[yolo]
mask = 6,7,8
anchors = 12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401
classes=80
num=9
jitter=.3
ignore_thresh = .7
truth_thresh = 1
random=1
scale_x_y = 1.05
iou_thresh=0.213
cls_normalizer=1.0
iou_normalizer=0.07
iou_loss=ciou
nms_kind=greedynms
beta_nms=0.6
max_delta=5
person
bicycle
car
motorbike
aeroplane
bus
train
truck
boat
traffic light
fire hydrant
stop sign
parking meter
bench
bird
cat
dog
horse
sheep
cow
elephant
bear
zebra
giraffe
backpack
umbrella
handbag
tie
suitcase
frisbee
skis
snowboard
sports ball
kite
baseball bat
baseball glove
skateboard
surfboard
tennis racket
bottle
wine glass
cup
fork
knife
spoon
bowl
banana
apple
sandwich
orange
broccoli
carrot
hot dog
pizza
donut
cake
chair
sofa
pottedplant
bed
diningtable
toilet
tvmonitor
laptop
mouse
remote
keyboard
cell phone
microwave
oven
toaster
sink
refrigerator
book
clock
vase
scissors
teddy bear
hair drier
toothbrush
aeroplane
bicycle
bird
boat
bottle
bus
car
cat
chair
cow
diningtable
dog
horse
motorbike
person
pottedplant
sheep
sofa
train
tvmonitor
FROM ubuntu:18.04
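# Assumed usage (image name taken from the edge1 Deployment manifest in this repo):
#   docker build -t k8s-master:5000/edge/edge1:v1 . && docker push k8s-master:5000/edge/edge1:v1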
RUN apt-get -yqq update
RUN sed -i s@/archive.ubuntu.com/@/mirrors.aliyun.com/@g /etc/apt/sources.list
RUN apt-get clean
RUN apt-get -yqq update
RUN apt-get install -yqq openssh-client openssh-server
RUN echo 'root:PASSWORD' | chpasswd
RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config
RUN service ssh restart
RUN apt-get install -y software-properties-common
RUN add-apt-repository ppa:deadsnakes/ppa
RUN apt-get install -y python3.9
RUN apt-get autoremove -y python3
RUN ln -s /usr/bin/python3.9 /usr/bin/python
RUN ln -s /usr/bin/python3.9 /usr/bin/python3
RUN apt-get install -y python3.9-distutils
RUN apt-get install -y wget
RUN wget https://bootstrap.pypa.io/get-pip.py
RUN python get-pip.py
RUN pip3 -V
RUN ln -s /usr/local/bin/pip3 /usr/bin/pip3
RUN apt-get -yqq install libssl-dev libffi-dev gcc python3.9-dev libgl1-mesa-glx libsm6 libxext6 libglib2.0-0
RUN apt-get -yqq update
RUN pip3 config set global.index-url https://mirrors.aliyun.com/pypi/simple/
ADD requirements.txt /edge1/requirements.txt
#RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple --no-cache-dir -r requirements.txt
RUN pip3 install -r /edge1/requirements.txt
WORKDIR /edge1
ADD . /edge1
CMD ["python", "models.py"]
import logging
import threading
import queue
from socket import *
import numpy as np
import _thread
import time
qsize = 1
ip_add = '127.0.0.1'
server_port = 25001
connect_port = 25000
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s")
log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)
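# send_from / recv_into stream a fixed-size numpy buffer over a TCP socket, looping until
# every byte of the underlying memoryview has been sent / received.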
def send_from(arr, dest):
view = memoryview(arr).cast('B')
while len(view):
nsent = dest.send(view)
view = view[nsent:]
def recv_into(arr, source):
view = memoryview(arr).cast('B')
while len(view):
nrecv = source.recv_into(view)
view = view[nrecv:]
def doConnect(host, port):
sock = socket(AF_INET, SOCK_STREAM)
sock.settimeout(20)
flag = True
while flag:
try:
if flag:
log.info("try connect %s : %d", host, port)
sock.connect((host, port))
flag = False
log.info("try connect %s : %d SUCCESS", host, port)
except Exception as e:
log.error("Address-related error connecting to server: %s" % e)
time.sleep(3)
return sock
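# trans_thread (edge1): accepts the downstream node on server_port and connects upstream to
# the frame source on connect_port; incoming 480x640x3 frames are queued in imgQue, and the
# feature maps handed to put_d2() are streamed to the downstream node.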
class trans_thread(threading.Thread):
def __init__(self):
threading.Thread.__init__(self)
self.lock = threading.Lock()
self.imgQue = queue.Queue(qsize)
self.d2Que = queue.Queue(qsize)
self.server = socket(AF_INET, SOCK_STREAM)
self.server.bind(('', server_port))
self.server.listen(3)
log.info("bind %d", server_port)
self.a_client, addr = self.server.accept()
self.client = doConnect(ip_add, connect_port)
log.info('edge 1 init successfully')
def put_d2(self, d2):
while self.d2Que.full():
# print('d2 is full')
time.sleep(0.1)
d2 = d2.detach().numpy()
# print(len(d2))
# print(d2.dtype)
self.lock.acquire()
self.d2Que.put(d2)
self.lock.release()
def get_img(self):
while self.imgQue.empty():
# print('img is empty')
time.sleep(0.1)
self.lock.acquire()
img = self.imgQue.get()
self.lock.release()
# print('2: ', np.sum(img))
return img
def recv(self):
try:
arr = np.zeros(shape=(480, 640, 3), dtype=np.uint8)
img_sum = np.zeros(shape=(1,), dtype=np.int32)
cnt = 0
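# Warm-up handshake: receive a frame plus its pixel sum, echo back the count of consecutive
# checksum matches, and switch to normal streaming after five matches in a row.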
while 1:
recv_into(arr, self.client)
recv_into(img_sum, self.client)
if img_sum[0] == np.sum(arr):
cnt += 1
else:
cnt = 0
send_from(np.array([cnt]), self.client)
if cnt >= 5:
break
while 1:
if not self.imgQue.full():
recv_into(arr, self.client)
self.lock.acquire()
self.imgQue.put(arr.copy())  # copy: recv_into reuses arr for the next frame
self.lock.release()
# print('1: ', np.sum(arr))
else:
# print('img is full')
time.sleep(0.1)
except Exception as e:
log.error("connecting error: %s" % e)
self.client = doConnect(ip_add, connect_port)
def send(self):
while 1:
if not self.d2Que.empty():
self.lock.acquire()
d2 = self.d2Que.get()
self.lock.release()
send_from(d2, self.a_client)
else:
# print('d2 is empty')
time.sleep(0.1)
def run(self):
_thread.start_new_thread(self.recv, ())
_thread.start_new_thread(self.send, ())
# def print_time(threadName, delay, counter):
# while counter:
# if exitFlag:
# threadName.exit()
# time.sleep(delay)
# print ("%s: %s" % (threadName, time.ctime(time.time())))
# counter -= 1
apiVersion: apps/v1
kind: Deployment
metadata:
name: edge1
spec:
replicas: 1
selector:
matchLabels:
app: edge1
template:
metadata:
labels:
app: edge1
spec:
hostNetwork: true
nodeSelector:
kubernetes.io/hostname: node1
containers:
- name: edge1
image: k8s-master:5000/edge/edge1:v1
imagePullPolicy: Always
ports:
- containerPort: 25001
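# Assumed usage: kubectl apply -f <this manifest>; with hostNetwork the container listens on
# port 25001 of node1 directly.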
#---
#apiVersion: v1
#kind: Service
#metadata:
# name: edge1
#spec:
# type: NodePort
# selector:
# app: edge1
# ports:
# - name: tcp
# port: 32001
# targetPort: 25001
# nodePort: 32001
import logging
import sys
import cv2
import numpy as np
import torch
import torch.nn.functional as F
from torch import nn
from tool.torch_utils import *
from tool.yolo_layer import YoloLayer
class Mish(torch.nn.Module):
def __init__(self):
super().__init__()
def forward(self, x):
x = x * (torch.tanh(torch.nn.functional.softplus(x)))
return x
class Upsample(nn.Module):
def __init__(self):
super(Upsample, self).__init__()
def forward(self, x, target_size, inference=False):
assert (x.data.dim() == 4)
# _, _, tH, tW = target_size
if inference:
# B = x.data.size(0)
# C = x.data.size(1)
# H = x.data.size(2)
# W = x.data.size(3)
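# Inference path: integer-factor nearest-neighbour upsampling via view/expand; equivalent to
# the F.interpolate branch below when the target size is an exact multiple of the input size.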
return x.view(x.size(0), x.size(1), x.size(2), 1, x.size(3), 1). \
expand(x.size(0), x.size(1), x.size(2), target_size[2] // x.size(2), x.size(3),
target_size[3] // x.size(3)). \
contiguous().view(x.size(0), x.size(1), target_size[2], target_size[3])
else:
return F.interpolate(x, size=(target_size[2], target_size[3]), mode='nearest')
class Conv_Bn_Activation(nn.Module):
def __init__(self, in_channels, out_channels, kernel_size, stride, activation, bn=True, bias=False):
super().__init__()
pad = (kernel_size - 1) // 2
self.conv = nn.ModuleList()
if bias:
self.conv.append(nn.Conv2d(in_channels, out_channels, kernel_size, stride, pad))
else:
self.conv.append(nn.Conv2d(in_channels, out_channels, kernel_size, stride, pad, bias=False))
if bn:
self.conv.append(nn.BatchNorm2d(out_channels))
if activation == "mish":
self.conv.append(Mish())
elif activation == "relu":
self.conv.append(nn.ReLU(inplace=True))
elif activation == "leaky":
self.conv.append(nn.LeakyReLU(0.1, inplace=True))
elif activation == "linear":
pass
else:
print("activate error !!! {} {} {}".format(sys._getframe().f_code.co_filename,
sys._getframe().f_code.co_name, sys._getframe().f_lineno))
def forward(self, x):
for l in self.conv:
x = l(x)
return x
class ResBlock(nn.Module):
"""
Sequential residual blocks each of which consists of \
two convolution layers.
Args:
ch (int): number of input and output channels.
nblocks (int): number of residual blocks.
shortcut (bool): if True, residual tensor addition is enabled.
"""
def __init__(self, ch, nblocks=1, shortcut=True):
super().__init__()
self.shortcut = shortcut
self.module_list = nn.ModuleList()
for i in range(nblocks):
resblock_one = nn.ModuleList()
resblock_one.append(Conv_Bn_Activation(ch, ch, 1, 1, 'mish'))
resblock_one.append(Conv_Bn_Activation(ch, ch, 3, 1, 'mish'))
self.module_list.append(resblock_one)
def forward(self, x):
for module in self.module_list:
h = x
for res in module:
h = res(h)
x = x + h if self.shortcut else h
return x
class DownSample1(nn.Module):
def __init__(self):
super().__init__()
self.conv1 = Conv_Bn_Activation(3, 32, 3, 1, 'mish')
self.conv2 = Conv_Bn_Activation(32, 64, 3, 2, 'mish')
self.conv3 = Conv_Bn_Activation(64, 64, 1, 1, 'mish')
# [route]
# layers = -2
self.conv4 = Conv_Bn_Activation(64, 64, 1, 1, 'mish')
self.conv5 = Conv_Bn_Activation(64, 32, 1, 1, 'mish')
self.conv6 = Conv_Bn_Activation(32, 64, 3, 1, 'mish')
# [shortcut]
# from=-3
# activation = linear
self.conv7 = Conv_Bn_Activation(64, 64, 1, 1, 'mish')
# [route]
# layers = -1, -7
self.conv8 = Conv_Bn_Activation(128, 64, 1, 1, 'mish')
def forward(self, input):
x1 = self.conv1(input)
x2 = self.conv2(x1)
x3 = self.conv3(x2)
# route -2
x4 = self.conv4(x2)
x5 = self.conv5(x4)
x6 = self.conv6(x5)
# shortcut -3
x6 = x6 + x4
x7 = self.conv7(x6)
# [route]
# layers = -1, -7
x7 = torch.cat([x7, x3], dim=1)
x8 = self.conv8(x7)
return x8
class DownSample2(nn.Module):
def __init__(self):
super().__init__()
self.conv1 = Conv_Bn_Activation(64, 128, 3, 2, 'mish')
self.conv2 = Conv_Bn_Activation(128, 64, 1, 1, 'mish')
# r -2
self.conv3 = Conv_Bn_Activation(128, 64, 1, 1, 'mish')
self.resblock = ResBlock(ch=64, nblocks=2)
# s -3
self.conv4 = Conv_Bn_Activation(64, 64, 1, 1, 'mish')
# r -1 -10
self.conv5 = Conv_Bn_Activation(128, 128, 1, 1, 'mish')
def forward(self, input):
x1 = self.conv1(input)
x2 = self.conv2(x1)
x3 = self.conv3(x1)
r = self.resblock(x3)
x4 = self.conv4(r)
x4 = torch.cat([x4, x2], dim=1)
x5 = self.conv5(x4)
return x5
class DownSample3(nn.Module):
def __init__(self):
super().__init__()
self.conv1 = Conv_Bn_Activation(128, 256, 3, 2, 'mish')
self.conv2 = Conv_Bn_Activation(256, 128, 1, 1, 'mish')
self.conv3 = Conv_Bn_Activation(256, 128, 1, 1, 'mish')
self.resblock = ResBlock(ch=128, nblocks=8)
self.conv4 = Conv_Bn_Activation(128, 128, 1, 1, 'mish')
self.conv5 = Conv_Bn_Activation(256, 256, 1, 1, 'mish')
def forward(self, input):
x1 = self.conv1(input)
x2 = self.conv2(x1)
x3 = self.conv3(x1)
r = self.resblock(x3)
x4 = self.conv4(r)
x4 = torch.cat([x4, x2], dim=1)
x5 = self.conv5(x4)
return x5
class DownSample4(nn.Module):
def __init__(self):
super().__init__()
self.conv1 = Conv_Bn_Activation(256, 512, 3, 2, 'mish')
self.conv2 = Conv_Bn_Activation(512, 256, 1, 1, 'mish')
self.conv3 = Conv_Bn_Activation(512, 256, 1, 1, 'mish')
self.resblock = ResBlock(ch=256, nblocks=8)
self.conv4 = Conv_Bn_Activation(256, 256, 1, 1, 'mish')
self.conv5 = Conv_Bn_Activation(512, 512, 1, 1, 'mish')
def forward(self, input):
x1 = self.conv1(input)
x2 = self.conv2(x1)
x3 = self.conv3(x1)
r = self.resblock(x3)
x4 = self.conv4(r)
x4 = torch.cat([x4, x2], dim=1)
x5 = self.conv5(x4)
return x5
class DownSample5(nn.Module):
def __init__(self):
super().__init__()
self.conv1 = Conv_Bn_Activation(512, 1024, 3, 2, 'mish')
self.conv2 = Conv_Bn_Activation(1024, 512, 1, 1, 'mish')
self.conv3 = Conv_Bn_Activation(1024, 512, 1, 1, 'mish')
self.resblock = ResBlock(ch=512, nblocks=4)
self.conv4 = Conv_Bn_Activation(512, 512, 1, 1, 'mish')
self.conv5 = Conv_Bn_Activation(1024, 1024, 1, 1, 'mish')
def forward(self, input):
x1 = self.conv1(input)
x2 = self.conv2(x1)
x3 = self.conv3(x1)
r = self.resblock(x3)
x4 = self.conv4(r)
x4 = torch.cat([x4, x2], dim=1)
x5 = self.conv5(x4)
return x5
class Neck(nn.Module):
def __init__(self, inference=False):
super().__init__()
self.inference = inference
self.conv1 = Conv_Bn_Activation(1024, 512, 1, 1, 'leaky')
self.conv2 = Conv_Bn_Activation(512, 1024, 3, 1, 'leaky')
self.conv3 = Conv_Bn_Activation(1024, 512, 1, 1, 'leaky')
# SPP
self.maxpool1 = nn.MaxPool2d(kernel_size=5, stride=1, padding=5 // 2)
self.maxpool2 = nn.MaxPool2d(kernel_size=9, stride=1, padding=9 // 2)
self.maxpool3 = nn.MaxPool2d(kernel_size=13, stride=1, padding=13 // 2)
# R -1 -3 -5 -6
# SPP
self.conv4 = Conv_Bn_Activation(2048, 512, 1, 1, 'leaky')
self.conv5 = Conv_Bn_Activation(512, 1024, 3, 1, 'leaky')
self.conv6 = Conv_Bn_Activation(1024, 512, 1, 1, 'leaky')
self.conv7 = Conv_Bn_Activation(512, 256, 1, 1, 'leaky')
# UP
self.upsample1 = Upsample()
# R 85
self.conv8 = Conv_Bn_Activation(512, 256, 1, 1, 'leaky')
# R -1 -3
self.conv9 = Conv_Bn_Activation(512, 256, 1, 1, 'leaky')
self.conv10 = Conv_Bn_Activation(256, 512, 3, 1, 'leaky')
self.conv11 = Conv_Bn_Activation(512, 256, 1, 1, 'leaky')
self.conv12 = Conv_Bn_Activation(256, 512, 3, 1, 'leaky')
self.conv13 = Conv_Bn_Activation(512, 256, 1, 1, 'leaky')
self.conv14 = Conv_Bn_Activation(256, 128, 1, 1, 'leaky')
# UP
self.upsample2 = Upsample()
# R 54
self.conv15 = Conv_Bn_Activation(256, 128, 1, 1, 'leaky')
# R -1 -3
self.conv16 = Conv_Bn_Activation(256, 128, 1, 1, 'leaky')
self.conv17 = Conv_Bn_Activation(128, 256, 3, 1, 'leaky')
self.conv18 = Conv_Bn_Activation(256, 128, 1, 1, 'leaky')
self.conv19 = Conv_Bn_Activation(128, 256, 3, 1, 'leaky')
self.conv20 = Conv_Bn_Activation(256, 128, 1, 1, 'leaky')
def forward(self, input, downsample4, downsample3, inference=False):
x1 = self.conv1(input)
x2 = self.conv2(x1)
x3 = self.conv3(x2)
# SPP
m1 = self.maxpool1(x3)
m2 = self.maxpool2(x3)
m3 = self.maxpool3(x3)
spp = torch.cat([m3, m2, m1, x3], dim=1)
# SPP end
x4 = self.conv4(spp)
x5 = self.conv5(x4)
x6 = self.conv6(x5)
x7 = self.conv7(x6)
# UP
up = self.upsample1(x7, downsample4.size(), self.inference)
# R 85
x8 = self.conv8(downsample4)
# R -1 -3
x8 = torch.cat([x8, up], dim=1)
x9 = self.conv9(x8)
x10 = self.conv10(x9)
x11 = self.conv11(x10)
x12 = self.conv12(x11)
x13 = self.conv13(x12)
x14 = self.conv14(x13)
# UP
up = self.upsample2(x14, downsample3.size(), self.inference)
# R 54
x15 = self.conv15(downsample3)
# R -1 -3
x15 = torch.cat([x15, up], dim=1)
x16 = self.conv16(x15)
x17 = self.conv17(x16)
x18 = self.conv18(x17)
x19 = self.conv19(x18)
x20 = self.conv20(x19)
return x20, x13, x6
class Yolov4Head(nn.Module):
def __init__(self, output_ch, n_classes, inference=False):
super().__init__()
self.inference = inference
self.conv1 = Conv_Bn_Activation(128, 256, 3, 1, 'leaky')
self.conv2 = Conv_Bn_Activation(256, output_ch, 1, 1, 'linear', bn=False, bias=True)
self.yolo1 = YoloLayer(
anchor_mask=[0, 1, 2], num_classes=n_classes,
anchors=[12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401],
num_anchors=9, stride=8)
# R -4
self.conv3 = Conv_Bn_Activation(128, 256, 3, 2, 'leaky')
# R -1 -16
self.conv4 = Conv_Bn_Activation(512, 256, 1, 1, 'leaky')
self.conv5 = Conv_Bn_Activation(256, 512, 3, 1, 'leaky')
self.conv6 = Conv_Bn_Activation(512, 256, 1, 1, 'leaky')
self.conv7 = Conv_Bn_Activation(256, 512, 3, 1, 'leaky')
self.conv8 = Conv_Bn_Activation(512, 256, 1, 1, 'leaky')
self.conv9 = Conv_Bn_Activation(256, 512, 3, 1, 'leaky')
self.conv10 = Conv_Bn_Activation(512, output_ch, 1, 1, 'linear', bn=False, bias=True)
self.yolo2 = YoloLayer(
anchor_mask=[3, 4, 5], num_classes=n_classes,
anchors=[12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401],
num_anchors=9, stride=16)
# R -4
self.conv11 = Conv_Bn_Activation(256, 512, 3, 2, 'leaky')
# R -1 -37
self.conv12 = Conv_Bn_Activation(1024, 512, 1, 1, 'leaky')
self.conv13 = Conv_Bn_Activation(512, 1024, 3, 1, 'leaky')
self.conv14 = Conv_Bn_Activation(1024, 512, 1, 1, 'leaky')
self.conv15 = Conv_Bn_Activation(512, 1024, 3, 1, 'leaky')
self.conv16 = Conv_Bn_Activation(1024, 512, 1, 1, 'leaky')
self.conv17 = Conv_Bn_Activation(512, 1024, 3, 1, 'leaky')
self.conv18 = Conv_Bn_Activation(1024, output_ch, 1, 1, 'linear', bn=False, bias=True)
self.yolo3 = YoloLayer(
anchor_mask=[6, 7, 8], num_classes=n_classes,
anchors=[12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401],
num_anchors=9, stride=32)
def forward(self, input1, input2, input3):
x1 = self.conv1(input1)
x2 = self.conv2(x1)
x3 = self.conv3(input1)
# R -1 -16
x3 = torch.cat([x3, input2], dim=1)
x4 = self.conv4(x3)
x5 = self.conv5(x4)
x6 = self.conv6(x5)
x7 = self.conv7(x6)
x8 = self.conv8(x7)
x9 = self.conv9(x8)
x10 = self.conv10(x9)
# R -4
x11 = self.conv11(x8)
# R -1 -37
x11 = torch.cat([x11, input3], dim=1)
x12 = self.conv12(x11)
x13 = self.conv13(x12)
x14 = self.conv14(x13)
x15 = self.conv15(x14)
x16 = self.conv16(x15)
x17 = self.conv17(x16)
x18 = self.conv18(x17)
if self.inference:
y1 = self.yolo1(x2)
y2 = self.yolo2(x10)
y3 = self.yolo3(x18)
return get_region_boxes([y1, y2, y3])
else:
return [x2, x10, x18]
class Yolov4(nn.Module):
def __init__(self, yolov4conv137weight=None, n_classes=80, inference=False):
super().__init__()
output_ch = (4 + 1 + n_classes) * 3
# backbone
self.down1 = DownSample1()
self.down2 = DownSample2()
self.down3 = DownSample3()
self.down4 = DownSample4()
self.down5 = DownSample5()
# neck
self.neek = Neck(inference)
# yolov4conv137
if yolov4conv137weight:
_model = nn.Sequential(self.down1, self.down2, self.down3, self.down4, self.down5, self.neek)
pretrained_dict = torch.load(yolov4conv137weight)
model_dict = _model.state_dict()
# 1. remap the pretrained weights onto this model's parameter names by position
pretrained_dict = {k1: v for (k, v), k1 in zip(pretrained_dict.items(), model_dict)}
# 2. overwrite entries in the existing state dict
model_dict.update(pretrained_dict)
_model.load_state_dict(model_dict)
# head
self.head = Yolov4Head(output_ch, n_classes, inference)
def forward(self, input):
d1 = self.down1(input)
d2 = self.down2(d1)
d3 = self.down3(d2)
d4 = self.down4(d3)
d5 = self.down5(d4)
x20, x13, x6 = self.neek(d5, d4, d3)
output = self.head(x20, x13, x6)
return output
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s")
log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)
namesfile = 'data/coco.names'
n_classes = 80
weightfile = './yolov4.pth'
height = 608
width = 608
model = Yolov4(yolov4conv137weight=None, n_classes=n_classes, inference=True)
pretrained_dict = torch.load(weightfile, map_location=torch.device('cpu'))
model.load_state_dict(pretrained_dict)
model.eval()
# time.sleep(10)
from config_edge import trans_thread
trans = trans_thread()
trans.start()
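# Split-inference stage on edge1: for every frame pulled from trans, resize to the network
# input, convert BGR HWC uint8 to a normalized RGB NCHW float tensor, run down1 and down2
# locally, then ship the d2 feature map to the next node via trans.put_d2().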
while 1:
img = trans.get_img()
# print(np.sum(img))
img = cv2.resize(img, (width, height))
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
if type(img) == np.ndarray and len(img.shape) == 3: # cv2 image
img = torch.from_numpy(img.transpose(2, 0, 1)).float().div(255.0).unsqueeze(0)
elif type(img) == np.ndarray and len(img.shape) == 4:
img = torch.from_numpy(img.transpose(0, 3, 1, 2)).float().div(255.0)
else:
print("unknow image type")
exit(-1)
img = torch.autograd.Variable(img)
d1 = model.down1(img)
d2 = model.down2(d1)
log.info(d2.shape[0])
log.info(d2.shape[1])
log.info(d2.shape[2])
log.info(d2.shape[3])
trans.put_d2(d2)
FROM ubuntu:18.04
RUN apt-get -yqq update
RUN sed -i s@/archive.ubuntu.com/@/mirrors.aliyun.com/@g /etc/apt/sources.list
RUN apt-get clean
RUN apt-get -yqq update
RUN apt-get install -yqq openssh-client openssh-server
RUN echo 'root:PASSWORD' | chpasswd
RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config
RUN service ssh restart
RUN apt-get install -y software-properties-common
RUN add-apt-repository ppa:deadsnakes/ppa
RUN apt-get install -y python3.9
RUN apt-get autoremove -y python3
RUN ln -s /usr/bin/python3.9 /usr/bin/python
RUN ln -s /usr/bin/python3.9 /usr/bin/python3
RUN apt-get install -y python3.9-distutils
RUN apt-get install -y wget
RUN wget https://bootstrap.pypa.io/get-pip.py
RUN python get-pip.py
RUN pip3 -V
RUN ln -s /usr/local/bin/pip3 /usr/bin/pip3
RUN apt-get -yqq install libssl-dev libffi-dev gcc python3.9-dev libgl1-mesa-glx libsm6 libxext6 libglib2.0-0
RUN apt-get -yqq update
RUN pip3 config set global.index-url https://mirrors.aliyun.com/pypi/simple/
ADD requirements.txt /edge2/requirements.txt
#RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple --no-cache-dir -r requirements.txt
RUN pip3 install -r /edge2/requirements.txt
WORKDIR /edge2
ADD . /edge2
CMD ["python", "models.py"]
import logging
import threading
import queue
from socket import *
import numpy as np
import _thread
import time
qsize = 1
ip_add = '127.0.0.1'
server_port = 25002
connect_port = 25001
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s")
log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)
def send_from(arr, dest):
view = memoryview(arr).cast('B')
while len(view):
nsent = dest.send(view)
view = view[nsent:]
def recv_into(arr, source):
view = memoryview(arr).cast('B')
while len(view):
nrecv = source.recv_into(view)
view = view[nrecv:]
def doConnect(host, port):
sock = socket(AF_INET, SOCK_STREAM)
sock.settimeout(20)
flag = True
while flag:
try:
if flag:
log.info("try connect %s : %d", host, port)
sock.connect((host, port))
flag = False
except Exception as e:
log.error("Address-related error connecting to server: %s" % e)
time.sleep(3)
return sock
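# trans_thread (edge2): accepts the downstream node on server_port and connects upstream to
# edge1 on connect_port; incoming d2 feature maps (1x128x152x152 float32) are queued in d2Que,
# and the tensors handed to put_d4() are streamed to the downstream node.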
class trans_thread(threading.Thread):
def __init__(self):
threading.Thread.__init__(self)
self.lock = threading.Lock()
self.d2Que = queue.Queue(qsize)
self.d4Que = queue.Queue(qsize)
self.server = socket(AF_INET, SOCK_STREAM)
self.server.bind(('', server_port))
self.server.listen(3)
log.info("bind %d", server_port)
self.a_client, _ = self.server.accept()
self.client = doConnect(ip_add, connect_port)
log.info('edge 2 init successfully')
def get_d2(self):
while self.d2Que.empty():
print('d2 is empty')
time.sleep(0.1)
self.lock.acquire()
d2 = self.d2Que.get()
self.lock.release()
return d2
def recv(self):
try:
arr = np.zeros(shape=(1, 128, 152, 152), dtype=np.float32)
while 1:
if not self.d2Que.full():
recv_into(arr, self.client)
self.lock.acquire()
self.d2Que.put(arr.copy())  # copy: recv_into reuses arr for the next tensor
self.lock.release()
else:
# print('d2 is full')
time.sleep(0.1)
except Exception as e:
log.error("connecting error: %s" % e)
self.client = doConnect(ip_add, connect_port)
def send(self):
while 1:
if not self.d4Que.empty():
self.lock.acquire()
d4 = self.d4Que.get()
self.lock.release()
send_from(d4, self.a_client)
else:
time.sleep(0.1)
def put_d4(self, d4):
while self.d4Que.full():
# print('d4 is full')
time.sleep(0.1)
d4 = d4.detach().numpy()
self.lock.acquire()
self.d4Que.put(d4)
self.lock.release()
def run(self):
_thread.start_new_thread(self.recv, ())
_thread.start_new_thread(self.send, ())
# def print_time(threadName, delay, counter):
# while counter:
# if exitFlag:
# threadName.exit()
# time.sleep(delay)
# print ("%s: %s" % (threadName, time.ctime(time.time())))
# counter -= 1
apiVersion: apps/v1
kind: Deployment
metadata:
name: edge2
spec:
replicas: 1
selector:
matchLabels:
app: edge2
template:
metadata:
labels:
app: edge2
spec:
hostNetwork: true
nodeSelector:
kubernetes.io/hostname: node2
containers:
- name: edge2
image: k8s-master:5000/edge/edge2:v1
imagePullPolicy: Always
ports:
- containerPort: 25002
#---
#apiVersion: v1
#kind: Service
#metadata:
# name: edge2
#spec:
# type: NodePort
# selector:
# app: edge2
# ports:
# - name: tcp
# port: 32002
# targetPort: 25002
# nodePort: 32002
import sys
import torch
import torch.nn.functional as F
from torch import nn
from tool.torch_utils import *
from tool.yolo_layer import YoloLayer
class Mish(torch.nn.Module):
def __init__(self):
super().__init__()
def forward(self, x):
x = x * (torch.tanh(torch.nn.functional.softplus(x)))
return x
class Upsample(nn.Module):
def __init__(self):
super(Upsample, self).__init__()
def forward(self, x, target_size, inference=False):
assert (x.data.dim() == 4)
# _, _, tH, tW = target_size
if inference:
#B = x.data.size(0)
#C = x.data.size(1)
#H = x.data.size(2)
#W = x.data.size(3)
return x.view(x.size(0), x.size(1), x.size(2), 1, x.size(3), 1).\
expand(x.size(0), x.size(1), x.size(2), target_size[2] // x.size(2), x.size(3), target_size[3] // x.size(3)).\
contiguous().view(x.size(0), x.size(1), target_size[2], target_size[3])
else:
return F.interpolate(x, size=(target_size[2], target_size[3]), mode='nearest')
class Conv_Bn_Activation(nn.Module):
def __init__(self, in_channels, out_channels, kernel_size, stride, activation, bn=True, bias=False):
super().__init__()
pad = (kernel_size - 1) // 2
self.conv = nn.ModuleList()
if bias:
self.conv.append(nn.Conv2d(in_channels, out_channels, kernel_size, stride, pad))
else:
self.conv.append(nn.Conv2d(in_channels, out_channels, kernel_size, stride, pad, bias=False))
if bn:
self.conv.append(nn.BatchNorm2d(out_channels))
if activation == "mish":
self.conv.append(Mish())
elif activation == "relu":
self.conv.append(nn.ReLU(inplace=True))
elif activation == "leaky":
self.conv.append(nn.LeakyReLU(0.1, inplace=True))
elif activation == "linear":
pass
else:
print("activate error !!! {} {} {}".format(sys._getframe().f_code.co_filename,
sys._getframe().f_code.co_name, sys._getframe().f_lineno))
def forward(self, x):
for l in self.conv:
x = l(x)
return x
class ResBlock(nn.Module):
"""
Sequential residual blocks each of which consists of \
two convolution layers.
Args:
ch (int): number of input and output channels.
nblocks (int): number of residual blocks.
shortcut (bool): if True, residual tensor addition is enabled.
"""
def __init__(self, ch, nblocks=1, shortcut=True):
super().__init__()
self.shortcut = shortcut
self.module_list = nn.ModuleList()
for i in range(nblocks):
resblock_one = nn.ModuleList()
resblock_one.append(Conv_Bn_Activation(ch, ch, 1, 1, 'mish'))
resblock_one.append(Conv_Bn_Activation(ch, ch, 3, 1, 'mish'))
self.module_list.append(resblock_one)
def forward(self, x):
for module in self.module_list:
h = x
for res in module:
h = res(h)
x = x + h if self.shortcut else h
return x
class DownSample1(nn.Module):
def __init__(self):
super().__init__()
self.conv1 = Conv_Bn_Activation(3, 32, 3, 1, 'mish')
self.conv2 = Conv_Bn_Activation(32, 64, 3, 2, 'mish')
self.conv3 = Conv_Bn_Activation(64, 64, 1, 1, 'mish')
# [route]
# layers = -2
self.conv4 = Conv_Bn_Activation(64, 64, 1, 1, 'mish')
self.conv5 = Conv_Bn_Activation(64, 32, 1, 1, 'mish')
self.conv6 = Conv_Bn_Activation(32, 64, 3, 1, 'mish')
# [shortcut]
# from=-3
# activation = linear
self.conv7 = Conv_Bn_Activation(64, 64, 1, 1, 'mish')
# [route]
# layers = -1, -7
self.conv8 = Conv_Bn_Activation(128, 64, 1, 1, 'mish')
def forward(self, input):
x1 = self.conv1(input)
x2 = self.conv2(x1)
x3 = self.conv3(x2)
# route -2
x4 = self.conv4(x2)
x5 = self.conv5(x4)
x6 = self.conv6(x5)
# shortcut -3
x6 = x6 + x4
x7 = self.conv7(x6)
# [route]
# layers = -1, -7
x7 = torch.cat([x7, x3], dim=1)
x8 = self.conv8(x7)
return x8
class DownSample2(nn.Module):
def __init__(self):
super().__init__()
self.conv1 = Conv_Bn_Activation(64, 128, 3, 2, 'mish')
self.conv2 = Conv_Bn_Activation(128, 64, 1, 1, 'mish')
# r -2
self.conv3 = Conv_Bn_Activation(128, 64, 1, 1, 'mish')
self.resblock = ResBlock(ch=64, nblocks=2)
# s -3
self.conv4 = Conv_Bn_Activation(64, 64, 1, 1, 'mish')
# r -1 -10
self.conv5 = Conv_Bn_Activation(128, 128, 1, 1, 'mish')
def forward(self, input):
x1 = self.conv1(input)
x2 = self.conv2(x1)
x3 = self.conv3(x1)
r = self.resblock(x3)
x4 = self.conv4(r)
x4 = torch.cat([x4, x2], dim=1)
x5 = self.conv5(x4)
return x5
class DownSample3(nn.Module):
def __init__(self):
super().__init__()
self.conv1 = Conv_Bn_Activation(128, 256, 3, 2, 'mish')
self.conv2 = Conv_Bn_Activation(256, 128, 1, 1, 'mish')
self.conv3 = Conv_Bn_Activation(256, 128, 1, 1, 'mish')
self.resblock = ResBlock(ch=128, nblocks=8)
self.conv4 = Conv_Bn_Activation(128, 128, 1, 1, 'mish')
self.conv5 = Conv_Bn_Activation(256, 256, 1, 1, 'mish')
def forward(self, input):
x1 = self.conv1(input)
x2 = self.conv2(x1)
x3 = self.conv3(x1)
r = self.resblock(x3)
x4 = self.conv4(r)
x4 = torch.cat([x4, x2], dim=1)
x5 = self.conv5(x4)
return x5
class DownSample4(nn.Module):
def __init__(self):
super().__init__()
self.conv1 = Conv_Bn_Activation(256, 512, 3, 2, 'mish')
self.conv2 = Conv_Bn_Activation(512, 256, 1, 1, 'mish')
self.conv3 = Conv_Bn_Activation(512, 256, 1, 1, 'mish')
self.resblock = ResBlock(ch=256, nblocks=8)
self.conv4 = Conv_Bn_Activation(256, 256, 1, 1, 'mish')
self.conv5 = Conv_Bn_Activation(512, 512, 1, 1, 'mish')
def forward(self, input):
x1 = self.conv1(input)
x2 = self.conv2(x1)
x3 = self.conv3(x1)
r = self.resblock(x3)
x4 = self.conv4(r)
x4 = torch.cat([x4, x2], dim=1)
x5 = self.conv5(x4)
return x5
class DownSample5(nn.Module):
def __init__(self):
super().__init__()
self.conv1 = Conv_Bn_Activation(512, 1024, 3, 2, 'mish')
self.conv2 = Conv_Bn_Activation(1024, 512, 1, 1, 'mish')
self.conv3 = Conv_Bn_Activation(1024, 512, 1, 1, 'mish')
self.resblock = ResBlock(ch=512, nblocks=4)
self.conv4 = Conv_Bn_Activation(512, 512, 1, 1, 'mish')
self.conv5 = Conv_Bn_Activation(1024, 1024, 1, 1, 'mish')
def forward(self, input):
x1 = self.conv1(input)
x2 = self.conv2(x1)
x3 = self.conv3(x1)
r = self.resblock(x3)
x4 = self.conv4(r)
x4 = torch.cat([x4, x2], dim=1)
x5 = self.conv5(x4)
return x5
class Neck(nn.Module):
def __init__(self, inference=False):
super().__init__()
self.inference = inference
self.conv1 = Conv_Bn_Activation(1024, 512, 1, 1, 'leaky')
self.conv2 = Conv_Bn_Activation(512, 1024, 3, 1, 'leaky')
self.conv3 = Conv_Bn_Activation(1024, 512, 1, 1, 'leaky')
# SPP
self.maxpool1 = nn.MaxPool2d(kernel_size=5, stride=1, padding=5 // 2)
self.maxpool2 = nn.MaxPool2d(kernel_size=9, stride=1, padding=9 // 2)
self.maxpool3 = nn.MaxPool2d(kernel_size=13, stride=1, padding=13 // 2)
# R -1 -3 -5 -6
# SPP
self.conv4 = Conv_Bn_Activation(2048, 512, 1, 1, 'leaky')
self.conv5 = Conv_Bn_Activation(512, 1024, 3, 1, 'leaky')
self.conv6 = Conv_Bn_Activation(1024, 512, 1, 1, 'leaky')
self.conv7 = Conv_Bn_Activation(512, 256, 1, 1, 'leaky')
# UP
self.upsample1 = Upsample()
# R 85
self.conv8 = Conv_Bn_Activation(512, 256, 1, 1, 'leaky')
# R -1 -3
self.conv9 = Conv_Bn_Activation(512, 256, 1, 1, 'leaky')
self.conv10 = Conv_Bn_Activation(256, 512, 3, 1, 'leaky')
self.conv11 = Conv_Bn_Activation(512, 256, 1, 1, 'leaky')
self.conv12 = Conv_Bn_Activation(256, 512, 3, 1, 'leaky')
self.conv13 = Conv_Bn_Activation(512, 256, 1, 1, 'leaky')
self.conv14 = Conv_Bn_Activation(256, 128, 1, 1, 'leaky')
# UP
self.upsample2 = Upsample()
# R 54
self.conv15 = Conv_Bn_Activation(256, 128, 1, 1, 'leaky')
# R -1 -3
self.conv16 = Conv_Bn_Activation(256, 128, 1, 1, 'leaky')
self.conv17 = Conv_Bn_Activation(128, 256, 3, 1, 'leaky')
self.conv18 = Conv_Bn_Activation(256, 128, 1, 1, 'leaky')
self.conv19 = Conv_Bn_Activation(128, 256, 3, 1, 'leaky')
self.conv20 = Conv_Bn_Activation(256, 128, 1, 1, 'leaky')
def forward(self, input, downsample4, downsample3, inference=False):
x1 = self.conv1(input)
x2 = self.conv2(x1)
x3 = self.conv3(x2)
# SPP
m1 = self.maxpool1(x3)
m2 = self.maxpool2(x3)
m3 = self.maxpool3(x3)
spp = torch.cat([m3, m2, m1, x3], dim=1)
# SPP end
x4 = self.conv4(spp)
x5 = self.conv5(x4)
x6 = self.conv6(x5)
x7 = self.conv7(x6)
# UP
up = self.upsample1(x7, downsample4.size(), self.inference)
# R 85
x8 = self.conv8(downsample4)
# R -1 -3
x8 = torch.cat([x8, up], dim=1)
x9 = self.conv9(x8)
x10 = self.conv10(x9)
x11 = self.conv11(x10)
x12 = self.conv12(x11)
x13 = self.conv13(x12)
x14 = self.conv14(x13)
# UP
up = self.upsample2(x14, downsample3.size(), self.inference)
# R 54
x15 = self.conv15(downsample3)
# R -1 -3
x15 = torch.cat([x15, up], dim=1)
x16 = self.conv16(x15)
x17 = self.conv17(x16)
x18 = self.conv18(x17)
x19 = self.conv19(x18)
x20 = self.conv20(x19)
return x20, x13, x6
class Yolov4Head(nn.Module):
def __init__(self, output_ch, n_classes, inference=False):
super().__init__()
self.inference = inference
self.conv1 = Conv_Bn_Activation(128, 256, 3, 1, 'leaky')
self.conv2 = Conv_Bn_Activation(256, output_ch, 1, 1, 'linear', bn=False, bias=True)
self.yolo1 = YoloLayer(
anchor_mask=[0, 1, 2], num_classes=n_classes,
anchors=[12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401],
num_anchors=9, stride=8)
# R -4
self.conv3 = Conv_Bn_Activation(128, 256, 3, 2, 'leaky')
# R -1 -16
self.conv4 = Conv_Bn_Activation(512, 256, 1, 1, 'leaky')
self.conv5 = Conv_Bn_Activation(256, 512, 3, 1, 'leaky')
self.conv6 = Conv_Bn_Activation(512, 256, 1, 1, 'leaky')
self.conv7 = Conv_Bn_Activation(256, 512, 3, 1, 'leaky')
self.conv8 = Conv_Bn_Activation(512, 256, 1, 1, 'leaky')
self.conv9 = Conv_Bn_Activation(256, 512, 3, 1, 'leaky')
self.conv10 = Conv_Bn_Activation(512, output_ch, 1, 1, 'linear', bn=False, bias=True)
self.yolo2 = YoloLayer(
anchor_mask=[3, 4, 5], num_classes=n_classes,
anchors=[12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401],
num_anchors=9, stride=16)
# R -4
self.conv11 = Conv_Bn_Activation(256, 512, 3, 2, 'leaky')
# R -1 -37
self.conv12 = Conv_Bn_Activation(1024, 512, 1, 1, 'leaky')
self.conv13 = Conv_Bn_Activation(512, 1024, 3, 1, 'leaky')
self.conv14 = Conv_Bn_Activation(1024, 512, 1, 1, 'leaky')
self.conv15 = Conv_Bn_Activation(512, 1024, 3, 1, 'leaky')
self.conv16 = Conv_Bn_Activation(1024, 512, 1, 1, 'leaky')
self.conv17 = Conv_Bn_Activation(512, 1024, 3, 1, 'leaky')
self.conv18 = Conv_Bn_Activation(1024, output_ch, 1, 1, 'linear', bn=False, bias=True)
self.yolo3 = YoloLayer(
anchor_mask=[6, 7, 8], num_classes=n_classes,
anchors=[12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401],
num_anchors=9, stride=32)
def forward(self, input1, input2, input3):
x1 = self.conv1(input1)
x2 = self.conv2(x1)
x3 = self.conv3(input1)
# R -1 -16
x3 = torch.cat([x3, input2], dim=1)
x4 = self.conv4(x3)
x5 = self.conv5(x4)
x6 = self.conv6(x5)
x7 = self.conv7(x6)
x8 = self.conv8(x7)
x9 = self.conv9(x8)
x10 = self.conv10(x9)
# R -4
x11 = self.conv11(x8)
# R -1 -37
x11 = torch.cat([x11, input3], dim=1)
x12 = self.conv12(x11)
x13 = self.conv13(x12)
x14 = self.conv14(x13)
x15 = self.conv15(x14)
x16 = self.conv16(x15)
x17 = self.conv17(x16)
x18 = self.conv18(x17)
if self.inference:
y1 = self.yolo1(x2)
y2 = self.yolo2(x10)
y3 = self.yolo3(x18)
return get_region_boxes([y1, y2, y3])
else:
return [x2, x10, x18]
class Yolov4(nn.Module):
def __init__(self, yolov4conv137weight=None, n_classes=80, inference=False):
super().__init__()
output_ch = (4 + 1 + n_classes) * 3
# backbone
self.down1 = DownSample1()
self.down2 = DownSample2()
self.down3 = DownSample3()
self.down4 = DownSample4()
self.down5 = DownSample5()
# neck
self.neek = Neck(inference)
# yolov4conv137
if yolov4conv137weight:
_model = nn.Sequential(self.down1, self.down2, self.down3, self.down4, self.down5, self.neek)
pretrained_dict = torch.load(yolov4conv137weight)
model_dict = _model.state_dict()
# 1. remap the pretrained weights onto this model's parameter names by position
pretrained_dict = {k1: v for (k, v), k1 in zip(pretrained_dict.items(), model_dict)}
# 2. overwrite entries in the existing state dict
model_dict.update(pretrained_dict)
_model.load_state_dict(model_dict)
# head
self.head = Yolov4Head(output_ch, n_classes, inference)
def forward(self, input):
# Unused on this node: edge2 only runs self.down3 directly (see the loop at the bottom of this file).
pass
# d1 = self.down1(input)
# d2 = self.down2(d1)
#d3 = self.down3(input)
#d4 = self.down4(d3)
# d5 = self.down5(d4)
#
# x20, x13, x6 = self.neek(d5, d4, d3)
#
# output = self.head(x20, x13, x6)
# return output
namesfile = 'data/coco.names'
n_classes = 80
weightfile = './yolov4.pth'
height = 608
width = 608
model = Yolov4(yolov4conv137weight=None, n_classes=n_classes, inference=True)
pretrained_dict = torch.load(weightfile, map_location=torch.device('cpu'))
model.load_state_dict(pretrained_dict)
model.eval()
# time.sleep(10)
from config_edge import trans_thread
trans = trans_thread()
trans.start()
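# Split-inference stage on edge2: pull the d2 feature map from the upstream node, run down3
# locally, and forward the result downstream via trans.put_d4().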
while 1:
d2 = trans.get_d2()
d2 = torch.from_numpy(d2)
print(d2.shape[0])
print(d2.shape[1])
print(d2.shape[2])
print(d2.shape[3])
d3 = model.down3(d2)
trans.put_d4(d3)
FROM ubuntu:18.04
RUN apt-get -yqq update
RUN sed -i s@/archive.ubuntu.com/@/mirrors.aliyun.com/@g /etc/apt/sources.list
RUN apt-get clean
RUN apt-get -yqq update
RUN apt-get install -yqq openssh-client openssh-server
RUN echo 'root:PASSWORD' | chpasswd
RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config
RUN service ssh restart
RUN apt-get install -y software-properties-common
RUN add-apt-repository ppa:deadsnakes/ppa
RUN apt-get install -y python3.9
RUN apt-get autoremove -y python3
RUN ln -s /usr/bin/python3.9 /usr/bin/python
RUN ln -s /usr/bin/python3.9 /usr/bin/python3
RUN apt-get install -y python3.9-distutils
RUN apt-get install -y wget
RUN wget https://bootstrap.pypa.io/get-pip.py
RUN python get-pip.py
RUN pip3 -V
RUN ln -s /usr/local/bin/pip3 /usr/bin/pip3
RUN apt-get -yqq install libssl-dev libffi-dev gcc python3.9-dev libgl1-mesa-glx libsm6 libxext6 libglib2.0-0
RUN apt-get -yqq update
RUN pip3 config set global.index-url https://mirrors.aliyun.com/pypi/simple/
ADD requirements.txt /edge3/requirements.txt
#RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple --no-cache-dir -r requirements.txt
RUN pip3 install -r /edge3/requirements.txt
WORKDIR /edge3
ADD . /edge3
CMD ["python", "models.py"]
import logging
import threading
import queue
from socket import *
import numpy as np
import _thread
import time
qsize = 1
ip_add = '127.0.0.1'
server_port = 25003
connect_port = 25002
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s")
log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)
def send_from(arr, dest):
view = memoryview(arr).cast('B')
while len(view):
nsent = dest.send(view)
view = view[nsent:]
def recv_into(arr, source):
view = memoryview(arr).cast('B')
while len(view):
nrecv = source.recv_into(view)
view = view[nrecv:]
def doConnect(host, port):
sock = socket(AF_INET, SOCK_STREAM)
sock.settimeout(20)
flag = True
while flag:
try:
if flag:
log.info("try connect %s : %d", host, port)
sock.connect((host, port))
flag = False
except Exception as e:
log.error("Address-related error connecting to server: %s" % e)
time.sleep(3)
return sock
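# trans_thread (edge3): accepts the detection consumer on server_port and connects upstream to
# edge2 on connect_port; incoming feature maps (1x256x76x76 float32) are queued in d4Que, and
# put_box() replies with the box count followed by the boxes themselves (a single zero byte
# when no boxes were found).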
class trans_thread(threading.Thread):
def __init__(self):
threading.Thread.__init__(self)
self.lock = threading.Lock()
self.d4Que = queue.Queue(qsize)
self.boxQue = queue.Queue(qsize)
self.server = socket(AF_INET, SOCK_STREAM)
self.server.bind(('', server_port))
self.server.listen(3)
log.info("bind %d", server_port)
self.a_client, _ = self.server.accept()
self.a_client.settimeout(5)
self.client = doConnect(ip_add, connect_port)
log.info('edge 3 init successfully')
def put_box(self, box):
while self.boxQue.full():
# print('box is full')
time.sleep(0.1)
self.lock.acquire()
self.boxQue.put((len(box[0]), np.array(box, dtype=np.float32)))
self.lock.release()
def get_d4(self):
while self.d4Que.empty():
print('d4 is empty')
time.sleep(0.1)
self.lock.acquire()
d4 = self.d4Que.get()
self.lock.release()
return d4
def recv(self):
arr = np.zeros(shape=(1, 256, 76, 76), dtype=np.float32)
while 1:
if not self.d4Que.full():
recv_into(arr, self.client)
self.lock.acquire()
self.d4Que.put(arr.copy())  # copy: recv_into reuses arr for the next tensor
self.lock.release()
else:
# print('d4 is full')
time.sleep(0.1)
def send(self):
try:
while 1:
if not self.boxQue.empty():
self.lock.acquire()
lth, box = self.boxQue.get()
self.lock.release()
if lth == 0:
send_from(np.zeros(1, dtype=np.uint8), self.a_client)
else:
lth = np.array([lth])
send_from(lth, self.a_client)
send_from(box, self.a_client)
else:
# print('box is empty')
time.sleep(0.1)
except Exception as e:
log.error("connecting error: %s" % e)
self.a_client, _ = self.server.accept()
def run(self):
_thread.start_new_thread(self.recv, ())
_thread.start_new_thread(self.send, ())
# def print_time(threadName, delay, counter):
# while counter:
# if exitFlag:
# threadName.exit()
# time.sleep(delay)
# print ("%s: %s" % (threadName, time.ctime(time.time())))
# counter -= 1
apiVersion: apps/v1
kind: Deployment
metadata:
name: edge3
spec:
replicas: 1
selector:
matchLabels:
app: edge3
template:
metadata:
labels:
app: edge3
spec:
hostNetwork: true
nodeSelector:
kubernetes.io/hostname: node3
containers:
- name: edge3
image: k8s-master:5000/edge/edge3:v1
imagePullPolicy: Always
ports:
- containerPort: 25003
#---
#apiVersion: v1
#kind: Service
#metadata:
# name: edge3
#spec:
# type: NodePort
# selector:
# app: edge3
# ports:
# - name: tcp
# port: 32003
# targetPort: 25003
# nodePort: 32003
import logging
import sys
import torch
import torch.nn.functional as F
from torch import nn
from tool.torch_utils import *
from tool.torch_utils import do_detect
from tool.utils import load_class_names
from tool.yolo_layer import YoloLayer
class Mish(torch.nn.Module):
def __init__(self):
super().__init__()
def forward(self, x):
x = x * (torch.tanh(torch.nn.functional.softplus(x)))
return x
class Upsample(nn.Module):
def __init__(self):
super(Upsample, self).__init__()
def forward(self, x, target_size, inference=False):
assert (x.data.dim() == 4)
# _, _, tH, tW = target_size
if inference:
# B = x.data.size(0)
# C = x.data.size(1)
# H = x.data.size(2)
# W = x.data.size(3)
return x.view(x.size(0), x.size(1), x.size(2), 1, x.size(3), 1). \
expand(x.size(0), x.size(1), x.size(2), target_size[2] // x.size(2), x.size(3),
target_size[3] // x.size(3)). \
contiguous().view(x.size(0), x.size(1), target_size[2], target_size[3])
else:
return F.interpolate(x, size=(target_size[2], target_size[3]), mode='nearest')
class Conv_Bn_Activation(nn.Module):
def __init__(self, in_channels, out_channels, kernel_size, stride, activation, bn=True, bias=False):
super().__init__()
pad = (kernel_size - 1) // 2
self.conv = nn.ModuleList()
if bias:
self.conv.append(nn.Conv2d(in_channels, out_channels, kernel_size, stride, pad))
else:
self.conv.append(nn.Conv2d(in_channels, out_channels, kernel_size, stride, pad, bias=False))
if bn:
self.conv.append(nn.BatchNorm2d(out_channels))
if activation == "mish":
self.conv.append(Mish())
elif activation == "relu":
self.conv.append(nn.ReLU(inplace=True))
elif activation == "leaky":
self.conv.append(nn.LeakyReLU(0.1, inplace=True))
elif activation == "linear":
pass
else:
print("activate error !!! {} {} {}".format(sys._getframe().f_code.co_filename,
sys._getframe().f_code.co_name, sys._getframe().f_lineno))
def forward(self, x):
for l in self.conv:
x = l(x)
return x
class ResBlock(nn.Module):
"""
Sequential residual blocks each of which consists of \
two convolution layers.
Args:
ch (int): number of input and output channels.
nblocks (int): number of residual blocks.
shortcut (bool): if True, residual tensor addition is enabled.
"""
def __init__(self, ch, nblocks=1, shortcut=True):
super().__init__()
self.shortcut = shortcut
self.module_list = nn.ModuleList()
for i in range(nblocks):
resblock_one = nn.ModuleList()
resblock_one.append(Conv_Bn_Activation(ch, ch, 1, 1, 'mish'))
resblock_one.append(Conv_Bn_Activation(ch, ch, 3, 1, 'mish'))
self.module_list.append(resblock_one)
def forward(self, x):
for module in self.module_list:
h = x
for res in module:
h = res(h)
x = x + h if self.shortcut else h
return x
class DownSample1(nn.Module):
def __init__(self):
super().__init__()
self.conv1 = Conv_Bn_Activation(3, 32, 3, 1, 'mish')
self.conv2 = Conv_Bn_Activation(32, 64, 3, 2, 'mish')
self.conv3 = Conv_Bn_Activation(64, 64, 1, 1, 'mish')
# [route]
# layers = -2
self.conv4 = Conv_Bn_Activation(64, 64, 1, 1, 'mish')
self.conv5 = Conv_Bn_Activation(64, 32, 1, 1, 'mish')
self.conv6 = Conv_Bn_Activation(32, 64, 3, 1, 'mish')
# [shortcut]
# from=-3
# activation = linear
self.conv7 = Conv_Bn_Activation(64, 64, 1, 1, 'mish')
# [route]
# layers = -1, -7
self.conv8 = Conv_Bn_Activation(128, 64, 1, 1, 'mish')
def forward(self, input):
x1 = self.conv1(input)
x2 = self.conv2(x1)
x3 = self.conv3(x2)
# route -2
x4 = self.conv4(x2)
x5 = self.conv5(x4)
x6 = self.conv6(x5)
# shortcut -3
x6 = x6 + x4
x7 = self.conv7(x6)
# [route]
# layers = -1, -7
x7 = torch.cat([x7, x3], dim=1)
x8 = self.conv8(x7)
return x8
class DownSample2(nn.Module):
def __init__(self):
super().__init__()
self.conv1 = Conv_Bn_Activation(64, 128, 3, 2, 'mish')
self.conv2 = Conv_Bn_Activation(128, 64, 1, 1, 'mish')
# r -2
self.conv3 = Conv_Bn_Activation(128, 64, 1, 1, 'mish')
self.resblock = ResBlock(ch=64, nblocks=2)
# s -3
self.conv4 = Conv_Bn_Activation(64, 64, 1, 1, 'mish')
# r -1 -10
self.conv5 = Conv_Bn_Activation(128, 128, 1, 1, 'mish')
def forward(self, input):
x1 = self.conv1(input)
x2 = self.conv2(x1)
x3 = self.conv3(x1)
r = self.resblock(x3)
x4 = self.conv4(r)
x4 = torch.cat([x4, x2], dim=1)
x5 = self.conv5(x4)
return x5
class DownSample3(nn.Module):
def __init__(self):
super().__init__()
self.conv1 = Conv_Bn_Activation(128, 256, 3, 2, 'mish')
self.conv2 = Conv_Bn_Activation(256, 128, 1, 1, 'mish')
self.conv3 = Conv_Bn_Activation(256, 128, 1, 1, 'mish')
self.resblock = ResBlock(ch=128, nblocks=8)
self.conv4 = Conv_Bn_Activation(128, 128, 1, 1, 'mish')
self.conv5 = Conv_Bn_Activation(256, 256, 1, 1, 'mish')
def forward(self, input):
x1 = self.conv1(input)
x2 = self.conv2(x1)
x3 = self.conv3(x1)
r = self.resblock(x3)
x4 = self.conv4(r)
x4 = torch.cat([x4, x2], dim=1)
x5 = self.conv5(x4)
return x5
class DownSample4(nn.Module):
def __init__(self):
super().__init__()
self.conv1 = Conv_Bn_Activation(256, 512, 3, 2, 'mish')
self.conv2 = Conv_Bn_Activation(512, 256, 1, 1, 'mish')
self.conv3 = Conv_Bn_Activation(512, 256, 1, 1, 'mish')
self.resblock = ResBlock(ch=256, nblocks=8)
self.conv4 = Conv_Bn_Activation(256, 256, 1, 1, 'mish')
self.conv5 = Conv_Bn_Activation(512, 512, 1, 1, 'mish')
def forward(self, input):
x1 = self.conv1(input)
x2 = self.conv2(x1)
x3 = self.conv3(x1)
r = self.resblock(x3)
x4 = self.conv4(r)
x4 = torch.cat([x4, x2], dim=1)
x5 = self.conv5(x4)
return x5
class DownSample5(nn.Module):
def __init__(self):
super().__init__()
self.conv1 = Conv_Bn_Activation(512, 1024, 3, 2, 'mish')
self.conv2 = Conv_Bn_Activation(1024, 512, 1, 1, 'mish')
self.conv3 = Conv_Bn_Activation(1024, 512, 1, 1, 'mish')
self.resblock = ResBlock(ch=512, nblocks=4)
self.conv4 = Conv_Bn_Activation(512, 512, 1, 1, 'mish')
self.conv5 = Conv_Bn_Activation(1024, 1024, 1, 1, 'mish')
def forward(self, input):
x1 = self.conv1(input)
x2 = self.conv2(x1)
x3 = self.conv3(x1)
r = self.resblock(x3)
x4 = self.conv4(r)
x4 = torch.cat([x4, x2], dim=1)
x5 = self.conv5(x4)
return x5
class Neck(nn.Module):
def __init__(self, inference=False):
super().__init__()
self.inference = inference
self.conv1 = Conv_Bn_Activation(1024, 512, 1, 1, 'leaky')
self.conv2 = Conv_Bn_Activation(512, 1024, 3, 1, 'leaky')
self.conv3 = Conv_Bn_Activation(1024, 512, 1, 1, 'leaky')
# SPP
self.maxpool1 = nn.MaxPool2d(kernel_size=5, stride=1, padding=5 // 2)
self.maxpool2 = nn.MaxPool2d(kernel_size=9, stride=1, padding=9 // 2)
self.maxpool3 = nn.MaxPool2d(kernel_size=13, stride=1, padding=13 // 2)
# R -1 -3 -5 -6
# SPP
self.conv4 = Conv_Bn_Activation(2048, 512, 1, 1, 'leaky')
self.conv5 = Conv_Bn_Activation(512, 1024, 3, 1, 'leaky')
self.conv6 = Conv_Bn_Activation(1024, 512, 1, 1, 'leaky')
self.conv7 = Conv_Bn_Activation(512, 256, 1, 1, 'leaky')
# UP
self.upsample1 = Upsample()
# R 85
self.conv8 = Conv_Bn_Activation(512, 256, 1, 1, 'leaky')
# R -1 -3
self.conv9 = Conv_Bn_Activation(512, 256, 1, 1, 'leaky')
self.conv10 = Conv_Bn_Activation(256, 512, 3, 1, 'leaky')
self.conv11 = Conv_Bn_Activation(512, 256, 1, 1, 'leaky')
self.conv12 = Conv_Bn_Activation(256, 512, 3, 1, 'leaky')
self.conv13 = Conv_Bn_Activation(512, 256, 1, 1, 'leaky')
self.conv14 = Conv_Bn_Activation(256, 128, 1, 1, 'leaky')
# UP
self.upsample2 = Upsample()
# R 54
self.conv15 = Conv_Bn_Activation(256, 128, 1, 1, 'leaky')
# R -1 -3
self.conv16 = Conv_Bn_Activation(256, 128, 1, 1, 'leaky')
self.conv17 = Conv_Bn_Activation(128, 256, 3, 1, 'leaky')
self.conv18 = Conv_Bn_Activation(256, 128, 1, 1, 'leaky')
self.conv19 = Conv_Bn_Activation(128, 256, 3, 1, 'leaky')
self.conv20 = Conv_Bn_Activation(256, 128, 1, 1, 'leaky')
def forward(self, input, downsample4, downsample3, inference=False):
x1 = self.conv1(input)
x2 = self.conv2(x1)
x3 = self.conv3(x2)
# SPP
m1 = self.maxpool1(x3)
m2 = self.maxpool2(x3)
m3 = self.maxpool3(x3)
spp = torch.cat([m3, m2, m1, x3], dim=1)
# SPP end
x4 = self.conv4(spp)
x5 = self.conv5(x4)
x6 = self.conv6(x5)
x7 = self.conv7(x6)
# UP
up = self.upsample1(x7, downsample4.size(), self.inference)
# R 85
x8 = self.conv8(downsample4)
# R -1 -3
x8 = torch.cat([x8, up], dim=1)
x9 = self.conv9(x8)
x10 = self.conv10(x9)
x11 = self.conv11(x10)
x12 = self.conv12(x11)
x13 = self.conv13(x12)
x14 = self.conv14(x13)
# UP
up = self.upsample2(x14, downsample3.size(), self.inference)
# R 54
x15 = self.conv15(downsample3)
# R -1 -3
x15 = torch.cat([x15, up], dim=1)
x16 = self.conv16(x15)
x17 = self.conv17(x16)
x18 = self.conv18(x17)
x19 = self.conv19(x18)
x20 = self.conv20(x19)
return x20, x13, x6
class Yolov4Head(nn.Module):
def __init__(self, output_ch, n_classes, inference=False):
super().__init__()
self.inference = inference
self.conv1 = Conv_Bn_Activation(128, 256, 3, 1, 'leaky')
self.conv2 = Conv_Bn_Activation(256, output_ch, 1, 1, 'linear', bn=False, bias=True)
self.yolo1 = YoloLayer(
anchor_mask=[0, 1, 2], num_classes=n_classes,
anchors=[12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401],
num_anchors=9, stride=8)
# R -4
self.conv3 = Conv_Bn_Activation(128, 256, 3, 2, 'leaky')
# R -1 -16
self.conv4 = Conv_Bn_Activation(512, 256, 1, 1, 'leaky')
self.conv5 = Conv_Bn_Activation(256, 512, 3, 1, 'leaky')
self.conv6 = Conv_Bn_Activation(512, 256, 1, 1, 'leaky')
self.conv7 = Conv_Bn_Activation(256, 512, 3, 1, 'leaky')
self.conv8 = Conv_Bn_Activation(512, 256, 1, 1, 'leaky')
self.conv9 = Conv_Bn_Activation(256, 512, 3, 1, 'leaky')
self.conv10 = Conv_Bn_Activation(512, output_ch, 1, 1, 'linear', bn=False, bias=True)
self.yolo2 = YoloLayer(
anchor_mask=[3, 4, 5], num_classes=n_classes,
anchors=[12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401],
num_anchors=9, stride=16)
# R -4
self.conv11 = Conv_Bn_Activation(256, 512, 3, 2, 'leaky')
# R -1 -37
self.conv12 = Conv_Bn_Activation(1024, 512, 1, 1, 'leaky')
self.conv13 = Conv_Bn_Activation(512, 1024, 3, 1, 'leaky')
self.conv14 = Conv_Bn_Activation(1024, 512, 1, 1, 'leaky')
self.conv15 = Conv_Bn_Activation(512, 1024, 3, 1, 'leaky')
self.conv16 = Conv_Bn_Activation(1024, 512, 1, 1, 'leaky')
self.conv17 = Conv_Bn_Activation(512, 1024, 3, 1, 'leaky')
self.conv18 = Conv_Bn_Activation(1024, output_ch, 1, 1, 'linear', bn=False, bias=True)
self.yolo3 = YoloLayer(
anchor_mask=[6, 7, 8], num_classes=n_classes,
anchors=[12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401],
num_anchors=9, stride=32)
def forward(self, input1, input2, input3):
x1 = self.conv1(input1)
x2 = self.conv2(x1)
x3 = self.conv3(input1)
# R -1 -16
x3 = torch.cat([x3, input2], dim=1)
x4 = self.conv4(x3)
x5 = self.conv5(x4)
x6 = self.conv6(x5)
x7 = self.conv7(x6)
x8 = self.conv8(x7)
x9 = self.conv9(x8)
x10 = self.conv10(x9)
# R -4
x11 = self.conv11(x8)
# R -1 -37
x11 = torch.cat([x11, input3], dim=1)
x12 = self.conv12(x11)
x13 = self.conv13(x12)
x14 = self.conv14(x13)
x15 = self.conv15(x14)
x16 = self.conv16(x15)
x17 = self.conv17(x16)
x18 = self.conv18(x17)
if self.inference:
y1 = self.yolo1(x2)
y2 = self.yolo2(x10)
y3 = self.yolo3(x18)
return get_region_boxes([y1, y2, y3])
else:
return [x2, x10, x18]
class Yolov4(nn.Module):
def __init__(self, yolov4conv137weight=None, n_classes=80, inference=False):
super().__init__()
output_ch = (4 + 1 + n_classes) * 3
# backbone
self.down1 = DownSample1()
self.down2 = DownSample2()
self.down3 = DownSample3()
self.down4 = DownSample4()
self.down5 = DownSample5()
# neck
self.neek = Neck(inference)
# yolov4conv137
if yolov4conv137weight:
_model = nn.Sequential(self.down1, self.down2, self.down3, self.down4, self.down5, self.neek)
pretrained_dict = torch.load(yolov4conv137weight)
model_dict = _model.state_dict()
# 1. remap the pretrained keys onto this model's keys by position
pretrained_dict = {k1: v for (k, v), k1 in zip(pretrained_dict.items(), model_dict)}
# 2. overwrite entries in the existing state dict
model_dict.update(pretrained_dict)
_model.load_state_dict(model_dict)
# head
self.head = Yolov4Head(output_ch, n_classes, inference)
def forward(self, d3):
d4 = self.down4(d3)
d5 = self.down5(d4)
x20, x13, x6 = self.neek(d5, d4, d3)
output = self.head(x20, x13, x6)
return output
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s")
log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)
namesfile = 'data/coco.names'
n_classes = 80
weightfile = './yolov4.pth'
height = 608
width = 608
model = Yolov4(yolov4conv137weight=None, n_classes=n_classes, inference=True)
pretrained_dict = torch.load(weightfile, map_location=torch.device('cpu'))
model.load_state_dict(pretrained_dict)
model.eval()
# time.sleep(10)
from config_edge import trans_thread
trans = trans_thread()
trans.start()
model.eval()
class_names = load_class_names(namesfile)
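# Main loop: pull an intermediate feature map from the transfer thread, run the
# remaining YOLOv4 stages on it, and hand the resulting boxes back to the
# transfer thread so they can be sent downstream.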
while 1:
d3 = trans.get_d4()
d3 = torch.from_numpy(d3)
log.info(d3.shape[0])
log.info(d3.shape[1])
log.info(d3.shape[2])
log.info(d3.shape[3])
boxes = do_detect(model, d3, 0.4, 0.6, False)
# print(boxes)
trans.put_box(boxes)
numpy==1.20.1
torch==1.8.0
tensorboardX==2.0
matplotlib==3.3.4
tqdm==4.43.0
easydict==1.9
Pillow==8.1.2
scikit-image
opencv_python
pycocotools
kubernetes
# -*- coding: utf-8 -*-
'''
@Time : 2020/04/26 15:48
@Author : Tianxiaomo
@File : camera.py
@Notice :
@Modification :
@Author :
@Time :
@Detail :
'''
from __future__ import division
import cv2
from tool.darknet2pytorch import Darknet
import argparse
from tool.utils import *
from tool.torch_utils import *
def arg_parse():
"""
Parse arguments for the detect module
"""
parser = argparse.ArgumentParser(description='YOLO v4 Cam Demo')
parser.add_argument("--confidence", dest="confidence", help="Object Confidence to filter predictions", default=0.25)
parser.add_argument("--nms_thresh", dest="nms_thresh", help="NMS Threshhold", default=0.4)
parser.add_argument("--reso", dest='reso', help=
"Input resolution of the network. Increase to increase accuracy. Decrease to increase speed",
default="160", type=str)
return parser.parse_args()
if __name__ == '__main__':
cfgfile = "cfg/yolov4.cfg"
weightsfile = "weight/yolov4.weights"
args = arg_parse()
confidence = float(args.confidence)
nms_thesh = float(args.nms_thresh)
CUDA = torch.cuda.is_available()
num_classes = 80
bbox_attrs = 5 + num_classes
class_names = load_class_names("data/coco.names")
model = Darknet(cfgfile)
model.load_weights(weightsfile)
if CUDA:
model.cuda()
model.eval()
cap = cv2.VideoCapture(0)
assert cap.isOpened(), 'Cannot capture source'
frames = 0
start = time.time()
while cap.isOpened():
ret, frame = cap.read()
if ret:
sized = cv2.resize(frame, (model.width, model.height))
sized = cv2.cvtColor(sized, cv2.COLOR_BGR2RGB)
boxes = do_detect(model, sized, 0.5, 0.4, CUDA)
orig_im = plot_boxes_cv2(frame, boxes, class_names=class_names)
cv2.imshow("frame", orig_im)
key = cv2.waitKey(1)
if key & 0xFF == ord('q'):
break
frames += 1
print("FPS of the video is {:5.2f}".format(frames / (time.time() - start)))
else:
break
# -*- coding: utf-8 -*-
'''
@Time : 2020/05/08 11:45
@Author : Tianxiaomo
@File : coco_annotatin.py
@Notice :
@Modification :
@Author :
@Time :
@Detail :
'''
import json
from collections import defaultdict
from tqdm import tqdm
import os
"""hyper parameters"""
json_file_path = 'E:/Dataset/mscoco2017/annotations/instances_train2017.json'
images_dir_path = 'mscoco2017/train2017/'
output_path = '../data/val.txt'
"""load json file"""
name_box_id = defaultdict(list)
id_name = dict()
with open(json_file_path, encoding='utf-8') as f:
data = json.load(f)
"""generate labels"""
images = data['images']
annotations = data['annotations']
for ant in tqdm(annotations):
id = ant['image_id']
# name = os.path.join(images_dir_path, images[id]['file_name'])
name = os.path.join(images_dir_path, '{:012d}.jpg'.format(id))
cat = ant['category_id']
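# COCO category ids are not contiguous (e.g. 12, 26, 29 and 30 are unused);
# the chain below remaps the 80 used ids onto 0..79.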
if cat >= 1 and cat <= 11:
cat = cat - 1
elif cat >= 13 and cat <= 25:
cat = cat - 2
elif cat >= 27 and cat <= 28:
cat = cat - 3
elif cat >= 31 and cat <= 44:
cat = cat - 5
elif cat >= 46 and cat <= 65:
cat = cat - 6
elif cat == 67:
cat = cat - 7
elif cat == 70:
cat = cat - 9
elif cat >= 72 and cat <= 82:
cat = cat - 10
elif cat >= 84 and cat <= 90:
cat = cat - 11
name_box_id[name].append([ant['bbox'], cat])
"""write to txt"""
with open(output_path, 'w') as f:
for key in tqdm(name_box_id.keys()):
f.write(key)
box_infos = name_box_id[key]
for info in box_infos:
x_min = int(info[0][0])
y_min = int(info[0][1])
x_max = x_min + int(info[0][2])
y_max = y_min + int(info[0][3])
box_info = " %d,%d,%d,%d,%d" % (
x_min, y_min, x_max, y_max, int(info[1]))
f.write(box_info)
f.write('\n')
import sys
import torch
from tool.torch_utils import convert2cpu
def parse_cfg(cfgfile):
blocks = []
fp = open(cfgfile, 'r')
block = None
line = fp.readline()
while line != '':
line = line.rstrip()
if line == '' or line[0] == '#':
line = fp.readline()
continue
elif line[0] == '[':
if block:
blocks.append(block)
block = dict()
block['type'] = line.lstrip('[').rstrip(']')
# set default value
if block['type'] == 'convolutional':
block['batch_normalize'] = 0
else:
key, value = line.split('=')
key = key.strip()
if key == 'type':
key = '_type'
value = value.strip()
block[key] = value
line = fp.readline()
if block:
blocks.append(block)
fp.close()
return blocks
def print_cfg(blocks):
print('layer filters size input output');
prev_width = 416
prev_height = 416
prev_filters = 3
out_filters = []
out_widths = []
out_heights = []
ind = -2
for block in blocks:
ind = ind + 1
if block['type'] == 'net':
prev_width = int(block['width'])
prev_height = int(block['height'])
continue
elif block['type'] == 'convolutional':
filters = int(block['filters'])
kernel_size = int(block['size'])
stride = int(block['stride'])
is_pad = int(block['pad'])
pad = (kernel_size - 1) // 2 if is_pad else 0
width = (prev_width + 2 * pad - kernel_size) // stride + 1
height = (prev_height + 2 * pad - kernel_size) // stride + 1
print('%5d %-6s %4d %d x %d / %d %3d x %3d x%4d -> %3d x %3d x%4d' % (
ind, 'conv', filters, kernel_size, kernel_size, stride, prev_width, prev_height, prev_filters, width,
height, filters))
prev_width = width
prev_height = height
prev_filters = filters
out_widths.append(prev_width)
out_heights.append(prev_height)
out_filters.append(prev_filters)
elif block['type'] == 'maxpool':
pool_size = int(block['size'])
stride = int(block['stride'])
width = prev_width // stride
height = prev_height // stride
print('%5d %-6s %d x %d / %d %3d x %3d x%4d -> %3d x %3d x%4d' % (
ind, 'max', pool_size, pool_size, stride, prev_width, prev_height, prev_filters, width, height,
prev_filters))
prev_width = width
prev_height = height
# max pooling does not change the channel count
out_widths.append(prev_width)
out_heights.append(prev_height)
out_filters.append(prev_filters)
elif block['type'] == 'avgpool':
width = 1
height = 1
print('%5d %-6s %3d x %3d x%4d -> %3d' % (
ind, 'avg', prev_width, prev_height, prev_filters, prev_filters))
prev_width = width
prev_height = height
# global average pooling does not change the channel count
out_widths.append(prev_width)
out_heights.append(prev_height)
out_filters.append(prev_filters)
elif block['type'] == 'softmax':
print('%5d %-6s -> %3d' % (ind, 'softmax', prev_filters))
out_widths.append(prev_width)
out_heights.append(prev_height)
out_filters.append(prev_filters)
elif block['type'] == 'cost':
print('%5d %-6s -> %3d' % (ind, 'cost', prev_filters))
out_widths.append(prev_width)
out_heights.append(prev_height)
out_filters.append(prev_filters)
elif block['type'] == 'reorg':
stride = int(block['stride'])
filters = stride * stride * prev_filters
width = prev_width // stride
height = prev_height // stride
print('%5d %-6s / %d %3d x %3d x%4d -> %3d x %3d x%4d' % (
ind, 'reorg', stride, prev_width, prev_height, prev_filters, width, height, filters))
prev_width = width
prev_height = height
prev_filters = filters
out_widths.append(prev_width)
out_heights.append(prev_height)
out_filters.append(prev_filters)
elif block['type'] == 'upsample':
stride = int(block['stride'])
filters = prev_filters
width = prev_width * stride
height = prev_height * stride
print('%5d %-6s * %d %3d x %3d x%4d -> %3d x %3d x%4d' % (
ind, 'upsample', stride, prev_width, prev_height, prev_filters, width, height, filters))
prev_width = width
prev_height = height
prev_filters = filters
out_widths.append(prev_width)
out_heights.append(prev_height)
out_filters.append(prev_filters)
elif block['type'] == 'route':
layers = block['layers'].split(',')
layers = [int(i) if int(i) > 0 else int(i) + ind for i in layers]
if len(layers) == 1:
print('%5d %-6s %d' % (ind, 'route', layers[0]))
prev_width = out_widths[layers[0]]
prev_height = out_heights[layers[0]]
prev_filters = out_filters[layers[0]]
elif len(layers) == 2:
print('%5d %-6s %d %d' % (ind, 'route', layers[0], layers[1]))
prev_width = out_widths[layers[0]]
prev_height = out_heights[layers[0]]
assert (prev_width == out_widths[layers[1]])
assert (prev_height == out_heights[layers[1]])
prev_filters = out_filters[layers[0]] + out_filters[layers[1]]
elif len(layers) == 4:
print('%5d %-6s %d %d %d %d' % (ind, 'route', layers[0], layers[1], layers[2], layers[3]))
prev_width = out_widths[layers[0]]
prev_height = out_heights[layers[0]]
assert (prev_width == out_widths[layers[1]] == out_widths[layers[2]] == out_widths[layers[3]])
assert (prev_height == out_heights[layers[1]] == out_heights[layers[2]] == out_heights[layers[3]])
prev_filters = out_filters[layers[0]] + out_filters[layers[1]] + out_filters[layers[2]] + out_filters[
layers[3]]
else:
print("route error !!! {} {} {}".format(sys._getframe().f_code.co_filename,
sys._getframe().f_code.co_name, sys._getframe().f_lineno))
out_widths.append(prev_width)
out_heights.append(prev_height)
out_filters.append(prev_filters)
elif block['type'] in ['region', 'yolo']:
print('%5d %-6s' % (ind, 'detection'))
out_widths.append(prev_width)
out_heights.append(prev_height)
out_filters.append(prev_filters)
elif block['type'] == 'shortcut':
from_id = int(block['from'])
from_id = from_id if from_id > 0 else from_id + ind
print('%5d %-6s %d' % (ind, 'shortcut', from_id))
prev_width = out_widths[from_id]
prev_height = out_heights[from_id]
prev_filters = out_filters[from_id]
out_widths.append(prev_width)
out_heights.append(prev_height)
out_filters.append(prev_filters)
elif block['type'] == 'connected':
filters = int(block['output'])
print('%5d %-6s %d -> %3d' % (ind, 'connected', prev_filters, filters))
prev_filters = filters
out_widths.append(1)
out_heights.append(1)
out_filters.append(prev_filters)
else:
print('unknown type %s' % (block['type']))
def load_conv(buf, start, conv_model):
num_w = conv_model.weight.numel()
num_b = conv_model.bias.numel()
conv_model.bias.data.copy_(torch.from_numpy(buf[start:start + num_b]));
start = start + num_b
conv_model.weight.data.copy_(torch.from_numpy(buf[start:start + num_w]).reshape(conv_model.weight.data.shape));
start = start + num_w
return start
def save_conv(fp, conv_model):
if conv_model.bias.is_cuda:
convert2cpu(conv_model.bias.data).numpy().tofile(fp)
convert2cpu(conv_model.weight.data).numpy().tofile(fp)
else:
conv_model.bias.data.numpy().tofile(fp)
conv_model.weight.data.numpy().tofile(fp)
def load_conv_bn(buf, start, conv_model, bn_model):
num_w = conv_model.weight.numel()
num_b = bn_model.bias.numel()
bn_model.bias.data.copy_(torch.from_numpy(buf[start:start + num_b]));
start = start + num_b
bn_model.weight.data.copy_(torch.from_numpy(buf[start:start + num_b]));
start = start + num_b
bn_model.running_mean.copy_(torch.from_numpy(buf[start:start + num_b]));
start = start + num_b
bn_model.running_var.copy_(torch.from_numpy(buf[start:start + num_b]));
start = start + num_b
conv_model.weight.data.copy_(torch.from_numpy(buf[start:start + num_w]).reshape(conv_model.weight.data.shape));
start = start + num_w
return start
def save_conv_bn(fp, conv_model, bn_model):
if bn_model.bias.is_cuda:
convert2cpu(bn_model.bias.data).numpy().tofile(fp)
convert2cpu(bn_model.weight.data).numpy().tofile(fp)
convert2cpu(bn_model.running_mean).numpy().tofile(fp)
convert2cpu(bn_model.running_var).numpy().tofile(fp)
convert2cpu(conv_model.weight.data).numpy().tofile(fp)
else:
bn_model.bias.data.numpy().tofile(fp)
bn_model.weight.data.numpy().tofile(fp)
bn_model.running_mean.numpy().tofile(fp)
bn_model.running_var.numpy().tofile(fp)
conv_model.weight.data.numpy().tofile(fp)
def load_fc(buf, start, fc_model):
num_w = fc_model.weight.numel()
num_b = fc_model.bias.numel()
fc_model.bias.data.copy_(torch.from_numpy(buf[start:start + num_b]));
start = start + num_b
fc_model.weight.data.copy_(torch.from_numpy(buf[start:start + num_w]));
start = start + num_w
return start
def save_fc(fp, fc_model):
fc_model.bias.data.numpy().tofile(fp)
fc_model.weight.data.numpy().tofile(fp)
if __name__ == '__main__':
import sys
blocks = parse_cfg('cfg/yolo.cfg')
if len(sys.argv) == 2:
blocks = parse_cfg(sys.argv[1])
print_cfg(blocks)
import sys
import torch
from tool.darknet2pytorch import Darknet
def transform_to_onnx(cfgfile, weightfile, batch_size=1, dynamic=False):
model = Darknet(cfgfile)
model.print_network()
model.load_weights(weightfile)
print('Loading weights from %s... Done!' % (weightfile))
if batch_size <= 0:
dynamic = True
input_names = ["input"]
output_names = ['boxes', 'confs']
if dynamic:
x = torch.randn((1, 3, model.height, model.width), requires_grad=True)
onnx_file_name = "yolov4_-1_3_{}_{}_dynamic.onnx".format(model.height, model.width)
dynamic_axes = {"input": {0: "batch_size"}, "boxes": {0: "batch_size"}, "confs": {0: "batch_size"}}
# Export the model
print('Export the onnx model ...')
torch.onnx.export(model,
x,
onnx_file_name,
export_params=True,
opset_version=11,
do_constant_folding=True,
input_names=input_names, output_names=output_names,
dynamic_axes=dynamic_axes)
print('Onnx model exporting done')
return onnx_file_name
else:
x = torch.randn((batch_size, 3, model.height, model.width), requires_grad=True)
onnx_file_name = "yolov4_{}_3_{}_{}_static.onnx".format(batch_size, model.height, model.width)
torch.onnx.export(model,
x,
onnx_file_name,
export_params=True,
opset_version=11,
do_constant_folding=True,
input_names=input_names, output_names=output_names,
dynamic_axes=None)
print('Onnx model exporting done')
return onnx_file_name
if __name__ == '__main__':
if len(sys.argv) == 3:
cfgfile = sys.argv[1]
weightfile = sys.argv[2]
transform_to_onnx(cfgfile, weightfile)
elif len(sys.argv) == 4:
cfgfile = sys.argv[1]
weightfile = sys.argv[2]
batch_size = int(sys.argv[3])
transform_to_onnx(cfgfile, weightfile, batch_size)
elif len(sys.argv) == 5:
cfgfile = sys.argv[1]
weightfile = sys.argv[2]
batch_size = int(sys.argv[3])
dynamic = True if sys.argv[4] == 'True' else False
transform_to_onnx(cfgfile, weightfile, batch_size, dynamic)
else:
print('Please execute this script this way:\n')
print(' python darknet2onnx.py <cfgFile> <weightFile>')
print('or')
print(' python darknet2onnx.py <cfgFile> <weightFile> <batchSize> [dynamic]')
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from tool.region_loss import RegionLoss
from tool.yolo_layer import YoloLayer
from tool.config import *
from tool.torch_utils import *
class Mish(torch.nn.Module):
def __init__(self):
super().__init__()
def forward(self, x):
x = x * (torch.tanh(torch.nn.functional.softplus(x)))
return x
class MaxPoolDark(nn.Module):
def __init__(self, size=2, stride=1):
super(MaxPoolDark, self).__init__()
self.size = size
self.stride = stride
def forward(self, x):
'''
darknet output_size = (input_size + p - k) / s +1
p : padding = k - 1
k : size
s : stride
torch output_size = (input_size + 2*p -k) / s +1
p : padding = k//2
'''
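# Worked example: size=2, stride=1 on a 13x13 input. Darknet expects a 13x13
# output, but symmetric padding of k//2 = 1 would give 14x14, so the code pads
# asymmetrically (0 on one side, 1 on the other) to reproduce darknet's shape.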
p = self.size // 2
if ((x.shape[2] - 1) // self.stride) != ((x.shape[2] + 2 * p - self.size) // self.stride):
padding1 = (self.size - 1) // 2
padding2 = padding1 + 1
else:
padding1 = (self.size - 1) // 2
padding2 = padding1
if ((x.shape[3] - 1) // self.stride) != ((x.shape[3] + 2 * p - self.size) // self.stride):
padding3 = (self.size - 1) // 2
padding4 = padding3 + 1
else:
padding3 = (self.size - 1) // 2
padding4 = padding3
x = F.max_pool2d(F.pad(x, (padding3, padding4, padding1, padding2), mode='replicate'),
self.size, stride=self.stride)
return x
class Upsample_expand(nn.Module):
def __init__(self, stride=2):
super(Upsample_expand, self).__init__()
self.stride = stride
def forward(self, x):
assert (x.data.dim() == 4)
x = x.view(x.size(0), x.size(1), x.size(2), 1, x.size(3), 1).\
expand(x.size(0), x.size(1), x.size(2), self.stride, x.size(3), self.stride).contiguous().\
view(x.size(0), x.size(1), x.size(2) * self.stride, x.size(3) * self.stride)
return x
class Upsample_interpolate(nn.Module):
def __init__(self, stride):
super(Upsample_interpolate, self).__init__()
self.stride = stride
def forward(self, x):
assert (x.data.dim() == 4)
out = F.interpolate(x, size=(x.size(2) * self.stride, x.size(3) * self.stride), mode='nearest')
return out
class Reorg(nn.Module):
def __init__(self, stride=2):
super(Reorg, self).__init__()
self.stride = stride
def forward(self, x):
stride = self.stride
assert (x.data.dim() == 4)
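# Space-to-depth: every stride x stride block of pixels is moved into the
# channel dimension, so H and W shrink by stride and C grows by stride**2.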
B = x.data.size(0)
C = x.data.size(1)
H = x.data.size(2)
W = x.data.size(3)
assert (H % stride == 0)
assert (W % stride == 0)
ws = stride
hs = stride
x = x.view(B, C, H // hs, hs, W // ws, ws).transpose(3, 4).contiguous()
x = x.view(B, C, H // hs * W // ws, hs * ws).transpose(2, 3).contiguous()
x = x.view(B, C, hs * ws, H // hs, W // ws).transpose(1, 2).contiguous()
x = x.view(B, hs * ws * C, H // hs, W // ws)
return x
class GlobalAvgPool2d(nn.Module):
def __init__(self):
super(GlobalAvgPool2d, self).__init__()
def forward(self, x):
N = x.data.size(0)
C = x.data.size(1)
H = x.data.size(2)
W = x.data.size(3)
x = F.avg_pool2d(x, (H, W))
x = x.view(N, C)
return x
# for route and shortcut
class EmptyModule(nn.Module):
def __init__(self):
super(EmptyModule, self).__init__()
def forward(self, x):
return x
# support route shortcut and reorg
class Darknet(nn.Module):
def __init__(self, cfgfile, inference=False):
super(Darknet, self).__init__()
self.inference = inference
self.training = not self.inference
self.blocks = parse_cfg(cfgfile)
self.width = int(self.blocks[0]['width'])
self.height = int(self.blocks[0]['height'])
self.models = self.create_network(self.blocks) # merge conv, bn,leaky
self.loss = self.models[len(self.models) - 1]
if self.blocks[(len(self.blocks) - 1)]['type'] == 'region':
self.anchors = self.loss.anchors
self.num_anchors = self.loss.num_anchors
self.anchor_step = self.loss.anchor_step
self.num_classes = self.loss.num_classes
self.header = torch.IntTensor([0, 0, 0, 0])
self.seen = 0
def forward(self, x):
ind = -2
self.loss = None
outputs = dict()
out_boxes = []
for block in self.blocks:
ind = ind + 1
# if ind > 0:
# return x
if block['type'] == 'net':
continue
elif block['type'] in ['convolutional', 'maxpool', 'reorg', 'upsample', 'avgpool', 'softmax', 'connected']:
x = self.models[ind](x)
outputs[ind] = x
elif block['type'] == 'route':
layers = block['layers'].split(',')
layers = [int(i) if int(i) > 0 else int(i) + ind for i in layers]
if len(layers) == 1:
if 'groups' not in block.keys() or int(block['groups']) == 1:
x = outputs[layers[0]]
outputs[ind] = x
else:
groups = int(block['groups'])
group_id = int(block['group_id'])
_, b, _, _ = outputs[layers[0]].shape
x = outputs[layers[0]][:, b // groups * group_id:b // groups * (group_id + 1)]
outputs[ind] = x
elif len(layers) == 2:
x1 = outputs[layers[0]]
x2 = outputs[layers[1]]
x = torch.cat((x1, x2), 1)
outputs[ind] = x
elif len(layers) == 4:
x1 = outputs[layers[0]]
x2 = outputs[layers[1]]
x3 = outputs[layers[2]]
x4 = outputs[layers[3]]
x = torch.cat((x1, x2, x3, x4), 1)
outputs[ind] = x
else:
print("rounte number > 2 ,is {}".format(len(layers)))
elif block['type'] == 'shortcut':
from_layer = int(block['from'])
activation = block['activation']
from_layer = from_layer if from_layer > 0 else from_layer + ind
x1 = outputs[from_layer]
x2 = outputs[ind - 1]
x = x1 + x2
if activation == 'leaky':
x = F.leaky_relu(x, 0.1, inplace=True)
elif activation == 'relu':
x = F.relu(x, inplace=True)
outputs[ind] = x
elif block['type'] == 'region':
continue
if self.loss:
self.loss = self.loss + self.models[ind](x)
else:
self.loss = self.models[ind](x)
outputs[ind] = None
elif block['type'] == 'yolo':
# if self.training:
# pass
# else:
# boxes = self.models[ind](x)
# out_boxes.append(boxes)
boxes = self.models[ind](x)
out_boxes.append(boxes)
elif block['type'] == 'cost':
continue
else:
print('unknown type %s' % (block['type']))
if self.training:
return out_boxes
else:
return get_region_boxes(out_boxes)
def print_network(self):
print_cfg(self.blocks)
def create_network(self, blocks):
models = nn.ModuleList()
prev_filters = 3
out_filters = []
prev_stride = 1
out_strides = []
conv_id = 0
for block in blocks:
if block['type'] == 'net':
prev_filters = int(block['channels'])
continue
elif block['type'] == 'convolutional':
conv_id = conv_id + 1
batch_normalize = int(block['batch_normalize'])
filters = int(block['filters'])
kernel_size = int(block['size'])
stride = int(block['stride'])
is_pad = int(block['pad'])
pad = (kernel_size - 1) // 2 if is_pad else 0
activation = block['activation']
model = nn.Sequential()
if batch_normalize:
model.add_module('conv{0}'.format(conv_id),
nn.Conv2d(prev_filters, filters, kernel_size, stride, pad, bias=False))
model.add_module('bn{0}'.format(conv_id), nn.BatchNorm2d(filters))
# model.add_module('bn{0}'.format(conv_id), BN2d(filters))
else:
model.add_module('conv{0}'.format(conv_id),
nn.Conv2d(prev_filters, filters, kernel_size, stride, pad))
if activation == 'leaky':
model.add_module('leaky{0}'.format(conv_id), nn.LeakyReLU(0.1, inplace=True))
elif activation == 'relu':
model.add_module('relu{0}'.format(conv_id), nn.ReLU(inplace=True))
elif activation == 'mish':
model.add_module('mish{0}'.format(conv_id), Mish())
else:
print("convalution havn't activate {}".format(activation))
prev_filters = filters
out_filters.append(prev_filters)
prev_stride = stride * prev_stride
out_strides.append(prev_stride)
models.append(model)
elif block['type'] == 'maxpool':
pool_size = int(block['size'])
stride = int(block['stride'])
if stride == 1 and pool_size % 2:
# MaxPoolDark could be used instead; this form is more convenient for ONNX conversion.
# Example: [maxpool] size=3 stride=1
model = nn.MaxPool2d(kernel_size=pool_size, stride=stride, padding=pool_size // 2)
elif stride == pool_size:
# MaxPoolDark could be used instead; this form is more convenient for ONNX conversion.
# Example: [maxpool] size=2 stride=2
model = nn.MaxPool2d(kernel_size=pool_size, stride=stride, padding=0)
else:
model = MaxPoolDark(pool_size, stride)
out_filters.append(prev_filters)
prev_stride = stride * prev_stride
out_strides.append(prev_stride)
models.append(model)
elif block['type'] == 'avgpool':
model = GlobalAvgPool2d()
out_filters.append(prev_filters)
models.append(model)
elif block['type'] == 'softmax':
model = nn.Softmax()
out_strides.append(prev_stride)
out_filters.append(prev_filters)
models.append(model)
elif block['type'] == 'cost':
if block['_type'] == 'sse':
model = nn.MSELoss(reduction='mean')
elif block['_type'] == 'L1':
model = nn.L1Loss(reduction='mean')
elif block['_type'] == 'smooth':
model = nn.SmoothL1Loss(reduction='mean')
out_filters.append(1)
out_strides.append(prev_stride)
models.append(model)
elif block['type'] == 'reorg':
stride = int(block['stride'])
prev_filters = stride * stride * prev_filters
out_filters.append(prev_filters)
prev_stride = prev_stride * stride
out_strides.append(prev_stride)
models.append(Reorg(stride))
elif block['type'] == 'upsample':
stride = int(block['stride'])
out_filters.append(prev_filters)
prev_stride = prev_stride // stride
out_strides.append(prev_stride)
models.append(Upsample_expand(stride))
# models.append(Upsample_interpolate(stride))
elif block['type'] == 'route':
layers = block['layers'].split(',')
ind = len(models)
layers = [int(i) if int(i) > 0 else int(i) + ind for i in layers]
if len(layers) == 1:
if 'groups' not in block.keys() or int(block['groups']) == 1:
prev_filters = out_filters[layers[0]]
prev_stride = out_strides[layers[0]]
else:
prev_filters = out_filters[layers[0]] // int(block['groups'])
prev_stride = out_strides[layers[0]] // int(block['groups'])
elif len(layers) == 2:
assert (layers[0] == ind - 1 or layers[1] == ind - 1)
prev_filters = out_filters[layers[0]] + out_filters[layers[1]]
prev_stride = out_strides[layers[0]]
elif len(layers) == 4:
assert (layers[0] == ind - 1)
prev_filters = out_filters[layers[0]] + out_filters[layers[1]] + out_filters[layers[2]] + \
out_filters[layers[3]]
prev_stride = out_strides[layers[0]]
else:
print("route error!!!")
out_filters.append(prev_filters)
out_strides.append(prev_stride)
models.append(EmptyModule())
elif block['type'] == 'shortcut':
ind = len(models)
prev_filters = out_filters[ind - 1]
out_filters.append(prev_filters)
prev_stride = out_strides[ind - 1]
out_strides.append(prev_stride)
models.append(EmptyModule())
elif block['type'] == 'connected':
filters = int(block['output'])
if block['activation'] == 'linear':
model = nn.Linear(prev_filters, filters)
elif block['activation'] == 'leaky':
model = nn.Sequential(
nn.Linear(prev_filters, filters),
nn.LeakyReLU(0.1, inplace=True))
elif block['activation'] == 'relu':
model = nn.Sequential(
nn.Linear(prev_filters, filters),
nn.ReLU(inplace=True))
prev_filters = filters
out_filters.append(prev_filters)
out_strides.append(prev_stride)
models.append(model)
elif block['type'] == 'region':
loss = RegionLoss()
anchors = block['anchors'].split(',')
loss.anchors = [float(i) for i in anchors]
loss.num_classes = int(block['classes'])
loss.num_anchors = int(block['num'])
loss.anchor_step = len(loss.anchors) // loss.num_anchors
loss.object_scale = float(block['object_scale'])
loss.noobject_scale = float(block['noobject_scale'])
loss.class_scale = float(block['class_scale'])
loss.coord_scale = float(block['coord_scale'])
out_filters.append(prev_filters)
out_strides.append(prev_stride)
models.append(loss)
elif block['type'] == 'yolo':
yolo_layer = YoloLayer()
anchors = block['anchors'].split(',')
anchor_mask = block['mask'].split(',')
yolo_layer.anchor_mask = [int(i) for i in anchor_mask]
yolo_layer.anchors = [float(i) for i in anchors]
yolo_layer.num_classes = int(block['classes'])
self.num_classes = yolo_layer.num_classes
yolo_layer.num_anchors = int(block['num'])
yolo_layer.anchor_step = len(yolo_layer.anchors) // yolo_layer.num_anchors
yolo_layer.stride = prev_stride
yolo_layer.scale_x_y = float(block['scale_x_y'])
# yolo_layer.object_scale = float(block['object_scale'])
# yolo_layer.noobject_scale = float(block['noobject_scale'])
# yolo_layer.class_scale = float(block['class_scale'])
# yolo_layer.coord_scale = float(block['coord_scale'])
out_filters.append(prev_filters)
out_strides.append(prev_stride)
models.append(yolo_layer)
else:
print('unknown type %s' % (block['type']))
return models
def load_weights(self, weightfile):
fp = open(weightfile, 'rb')
header = np.fromfile(fp, count=5, dtype=np.int32)
self.header = torch.from_numpy(header)
self.seen = self.header[3]
buf = np.fromfile(fp, dtype=np.float32)
fp.close()
start = 0
ind = -2
for block in self.blocks:
if start >= buf.size:
break
ind = ind + 1
if block['type'] == 'net':
continue
elif block['type'] == 'convolutional':
model = self.models[ind]
batch_normalize = int(block['batch_normalize'])
if batch_normalize:
start = load_conv_bn(buf, start, model[0], model[1])
else:
start = load_conv(buf, start, model[0])
elif block['type'] == 'connected':
model = self.models[ind]
if block['activation'] != 'linear':
start = load_fc(buf, start, model[0])
else:
start = load_fc(buf, start, model)
elif block['type'] == 'maxpool':
pass
elif block['type'] == 'reorg':
pass
elif block['type'] == 'upsample':
pass
elif block['type'] == 'route':
pass
elif block['type'] == 'shortcut':
pass
elif block['type'] == 'region':
pass
elif block['type'] == 'yolo':
pass
elif block['type'] == 'avgpool':
pass
elif block['type'] == 'softmax':
pass
elif block['type'] == 'cost':
pass
else:
print('unknown type %s' % (block['type']))
# def save_weights(self, outfile, cutoff=0):
# if cutoff <= 0:
# cutoff = len(self.blocks) - 1
#
# fp = open(outfile, 'wb')
# self.header[3] = self.seen
# header = self.header
# header.numpy().tofile(fp)
#
# ind = -1
# for blockId in range(1, cutoff + 1):
# ind = ind + 1
# block = self.blocks[blockId]
# if block['type'] == 'convolutional':
# model = self.models[ind]
# batch_normalize = int(block['batch_normalize'])
# if batch_normalize:
# save_conv_bn(fp, model[0], model[1])
# else:
# save_conv(fp, model[0])
# elif block['type'] == 'connected':
# model = self.models[ind]
# if block['activation'] != 'linear':
# save_fc(fc, model)
# else:
# save_fc(fc, model[0])
# elif block['type'] == 'maxpool':
# pass
# elif block['type'] == 'reorg':
# pass
# elif block['type'] == 'upsample':
# pass
# elif block['type'] == 'route':
# pass
# elif block['type'] == 'shortcut':
# pass
# elif block['type'] == 'region':
# pass
# elif block['type'] == 'yolo':
# pass
# elif block['type'] == 'avgpool':
# pass
# elif block['type'] == 'softmax':
# pass
# elif block['type'] == 'cost':
# pass
# else:
# print('unknown type %s' % (block['type']))
# fp.close()
import sys
import onnx
from onnx_tf.backend import prepare
# tensorflow >=2.0
# 1: Thanks to https://github.com/onnx/onnx-tensorflow
# 2: Run git clone https://github.com/onnx/onnx-tensorflow.git && cd onnx-tensorflow
# Run pip install -e .
# Note:
# Installing with "pip install onnx-tf" caused errors (at least for me);
# installing from source as above is recommended.
def transform_to_tensorflow(onnx_input_path, pb_output_path):
onnx_model = onnx.load(onnx_input_path) # load onnx model
tf_exp = prepare(onnx_model) # prepare tf representation
tf_exp.export_graph(pb_output_path) # export the model
if __name__ == '__main__':
if len(sys.argv) == 1:
sys.argv.append('../weight/yolov4_1_3_608_608.onnx') # use:darknet2onnx.py
sys.argv.append('../weight/yolov4.pb') # use:onnx2tensorflow.py
if len(sys.argv) == 3:
onnxfile = sys.argv[1]
tfpb_outfile = sys.argv[2]
transform_to_tensorflow(onnxfile, tfpb_outfile)
else:
print('Please execute this script this way:\n')
print(' python onnx2tensorflow.py <onnxfile> <tfpboutfile>')
import torch.nn as nn
import torch.nn.functional as F
from tool.torch_utils import *
def build_targets(pred_boxes, target, anchors, num_anchors, num_classes, nH, nW, noobject_scale, object_scale,
sil_thresh, seen):
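# Builds the regression/confidence/class targets for one region head: anchors
# whose IoU with any ground truth exceeds sil_thresh are excluded from the
# no-object confidence penalty, and the best-matching anchor per ground truth
# receives coordinate, confidence and class targets.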
nB = target.size(0)
nA = num_anchors
nC = num_classes
anchor_step = len(anchors) // num_anchors
conf_mask = torch.ones(nB, nA, nH, nW) * noobject_scale
coord_mask = torch.zeros(nB, nA, nH, nW)
cls_mask = torch.zeros(nB, nA, nH, nW)
tx = torch.zeros(nB, nA, nH, nW)
ty = torch.zeros(nB, nA, nH, nW)
tw = torch.zeros(nB, nA, nH, nW)
th = torch.zeros(nB, nA, nH, nW)
tconf = torch.zeros(nB, nA, nH, nW)
tcls = torch.zeros(nB, nA, nH, nW)
nAnchors = nA * nH * nW
nPixels = nH * nW
for b in range(nB):
cur_pred_boxes = pred_boxes[b * nAnchors:(b + 1) * nAnchors].t()
cur_ious = torch.zeros(nAnchors)
for t in range(50):
if target[b][t * 5 + 1] == 0:
break
gx = target[b][t * 5 + 1] * nW
gy = target[b][t * 5 + 2] * nH
gw = target[b][t * 5 + 3] * nW
gh = target[b][t * 5 + 4] * nH
cur_gt_boxes = torch.FloatTensor([gx, gy, gw, gh]).repeat(nAnchors, 1).t()
cur_ious = torch.max(cur_ious, bbox_ious(cur_pred_boxes, cur_gt_boxes, x1y1x2y2=False))
conf_mask[b][cur_ious > sil_thresh] = 0
if seen < 12800:
if anchor_step == 4:
tx = torch.FloatTensor(anchors).view(nA, anchor_step).index_select(1, torch.LongTensor([2])).view(1, nA, 1,
1).repeat(
nB, 1, nH, nW)
ty = torch.FloatTensor(anchors).view(num_anchors, anchor_step).index_select(1, torch.LongTensor([2])).view(
1, nA, 1, 1).repeat(nB, 1, nH, nW)
else:
tx.fill_(0.5)
ty.fill_(0.5)
tw.zero_()
th.zero_()
coord_mask.fill_(1)
nGT = 0
nCorrect = 0
for b in range(nB):
for t in range(50):
if target[b][t * 5 + 1] == 0:
break
nGT = nGT + 1
best_iou = 0.0
best_n = -1
min_dist = 10000
gx = target[b][t * 5 + 1] * nW
gy = target[b][t * 5 + 2] * nH
gi = int(gx)
gj = int(gy)
gw = target[b][t * 5 + 3] * nW
gh = target[b][t * 5 + 4] * nH
gt_box = [0, 0, gw, gh]
for n in range(nA):
aw = anchors[anchor_step * n]
ah = anchors[anchor_step * n + 1]
anchor_box = [0, 0, aw, ah]
iou = bbox_iou(anchor_box, gt_box, x1y1x2y2=False)
if anchor_step == 4:
ax = anchors[anchor_step * n + 2]
ay = anchors[anchor_step * n + 3]
dist = pow(((gi + ax) - gx), 2) + pow(((gj + ay) - gy), 2)
if iou > best_iou:
best_iou = iou
best_n = n
elif anchor_step == 4 and iou == best_iou and dist < min_dist:
best_iou = iou
best_n = n
min_dist = dist
gt_box = [gx, gy, gw, gh]
pred_box = pred_boxes[b * nAnchors + best_n * nPixels + gj * nW + gi]
coord_mask[b][best_n][gj][gi] = 1
cls_mask[b][best_n][gj][gi] = 1
conf_mask[b][best_n][gj][gi] = object_scale
tx[b][best_n][gj][gi] = target[b][t * 5 + 1] * nW - gi
ty[b][best_n][gj][gi] = target[b][t * 5 + 2] * nH - gj
tw[b][best_n][gj][gi] = math.log(gw / anchors[anchor_step * best_n])
th[b][best_n][gj][gi] = math.log(gh / anchors[anchor_step * best_n + 1])
iou = bbox_iou(gt_box, pred_box, x1y1x2y2=False) # best_iou
tconf[b][best_n][gj][gi] = iou
tcls[b][best_n][gj][gi] = target[b][t * 5]
if iou > 0.5:
nCorrect = nCorrect + 1
return nGT, nCorrect, coord_mask, conf_mask, cls_mask, tx, ty, tw, th, tconf, tcls
class RegionLoss(nn.Module):
def __init__(self, num_classes=0, anchors=[], num_anchors=1):
super(RegionLoss, self).__init__()
self.num_classes = num_classes
self.anchors = anchors
self.num_anchors = num_anchors
self.anchor_step = len(anchors) // num_anchors
self.coord_scale = 1
self.noobject_scale = 1
self.object_scale = 5
self.class_scale = 1
self.thresh = 0.6
self.seen = 0
def forward(self, output, target):
# output : BxAs*(4+1+num_classes)*H*W
t0 = time.time()
nB = output.data.size(0)
nA = self.num_anchors
nC = self.num_classes
nH = output.data.size(2)
nW = output.data.size(3)
output = output.view(nB, nA, (5 + nC), nH, nW)
x = F.sigmoid(output.index_select(2, Variable(torch.cuda.LongTensor([0]))).view(nB, nA, nH, nW))
y = F.sigmoid(output.index_select(2, Variable(torch.cuda.LongTensor([1]))).view(nB, nA, nH, nW))
w = output.index_select(2, Variable(torch.cuda.LongTensor([2]))).view(nB, nA, nH, nW)
h = output.index_select(2, Variable(torch.cuda.LongTensor([3]))).view(nB, nA, nH, nW)
conf = F.sigmoid(output.index_select(2, Variable(torch.cuda.LongTensor([4]))).view(nB, nA, nH, nW))
cls = output.index_select(2, Variable(torch.linspace(5, 5 + nC - 1, nC).long().cuda()))
cls = cls.view(nB * nA, nC, nH * nW).transpose(1, 2).contiguous().view(nB * nA * nH * nW, nC)
t1 = time.time()
pred_boxes = torch.cuda.FloatTensor(4, nB * nA * nH * nW)
grid_x = torch.linspace(0, nW - 1, nW).repeat(nH, 1).repeat(nB * nA, 1, 1).view(nB * nA * nH * nW).cuda()
grid_y = torch.linspace(0, nH - 1, nH).repeat(nW, 1).t().repeat(nB * nA, 1, 1).view(nB * nA * nH * nW).cuda()
anchor_w = torch.Tensor(self.anchors).view(nA, self.anchor_step).index_select(1, torch.LongTensor([0])).cuda()
anchor_h = torch.Tensor(self.anchors).view(nA, self.anchor_step).index_select(1, torch.LongTensor([1])).cuda()
anchor_w = anchor_w.repeat(nB, 1).repeat(1, 1, nH * nW).view(nB * nA * nH * nW)
anchor_h = anchor_h.repeat(nB, 1).repeat(1, 1, nH * nW).view(nB * nA * nH * nW)
pred_boxes[0] = x.data + grid_x
pred_boxes[1] = y.data + grid_y
pred_boxes[2] = torch.exp(w.data) * anchor_w
pred_boxes[3] = torch.exp(h.data) * anchor_h
pred_boxes = convert2cpu(pred_boxes.transpose(0, 1).contiguous().view(-1, 4))
t2 = time.time()
nGT, nCorrect, coord_mask, conf_mask, cls_mask, tx, ty, tw, th, tconf, tcls = build_targets(pred_boxes,
target.data,
self.anchors, nA,
nC, \
nH, nW,
self.noobject_scale,
self.object_scale,
self.thresh,
self.seen)
cls_mask = (cls_mask == 1)
nProposals = int((conf > 0.25).sum().item())
tx = Variable(tx.cuda())
ty = Variable(ty.cuda())
tw = Variable(tw.cuda())
th = Variable(th.cuda())
tconf = Variable(tconf.cuda())
tcls = Variable(tcls.view(-1)[cls_mask].long().cuda())
coord_mask = Variable(coord_mask.cuda())
conf_mask = Variable(conf_mask.cuda().sqrt())
cls_mask = Variable(cls_mask.view(-1, 1).repeat(1, nC).cuda())
cls = cls[cls_mask].view(-1, nC)
t3 = time.time()
loss_x = self.coord_scale * nn.MSELoss(reduction='sum')(x * coord_mask, tx * coord_mask) / 2.0
loss_y = self.coord_scale * nn.MSELoss(reduction='sum')(y * coord_mask, ty * coord_mask) / 2.0
loss_w = self.coord_scale * nn.MSELoss(reduction='sum')(w * coord_mask, tw * coord_mask) / 2.0
loss_h = self.coord_scale * nn.MSELoss(reduction='sum')(h * coord_mask, th * coord_mask) / 2.0
loss_conf = nn.MSELoss(reduction='sum')(conf * conf_mask, tconf * conf_mask) / 2.0
loss_cls = self.class_scale * nn.CrossEntropyLoss(reduction='sum')(cls, tcls)
loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls
t4 = time.time()
if False:
print('-----------------------------------')
print(' activation : %f' % (t1 - t0))
print(' create pred_boxes : %f' % (t2 - t1))
print(' build targets : %f' % (t3 - t2))
print(' create loss : %f' % (t4 - t3))
print(' total : %f' % (t4 - t0))
print('%d: nGT %d, recall %d, proposals %d, loss: x %f, y %f, w %f, h %f, conf %f, cls %f, total %f' % (
self.seen, nGT, nCorrect, nProposals, loss_x.data[0], loss_y.data[0], loss_w.data[0], loss_h.data[0],
loss_conf.data[0], loss_cls.data[0], loss.data[0]))
return loss
import sys
import os
import time
import math
import torch
import numpy as np
from torch.autograd import Variable
import itertools
import struct # get_image_size
import imghdr # get_image_size
from tool import utils
def bbox_ious(boxes1, boxes2, x1y1x2y2=True):
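# Vectorised IoU between corresponding columns of two 4xN box tensors; boxes
# are corner (x1,y1,x2,y2) or centre (cx,cy,w,h) encoded depending on x1y1x2y2.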
if x1y1x2y2:
mx = torch.min(boxes1[0], boxes2[0])
Mx = torch.max(boxes1[2], boxes2[2])
my = torch.min(boxes1[1], boxes2[1])
My = torch.max(boxes1[3], boxes2[3])
w1 = boxes1[2] - boxes1[0]
h1 = boxes1[3] - boxes1[1]
w2 = boxes2[2] - boxes2[0]
h2 = boxes2[3] - boxes2[1]
else:
mx = torch.min(boxes1[0] - boxes1[2] / 2.0, boxes2[0] - boxes2[2] / 2.0)
Mx = torch.max(boxes1[0] + boxes1[2] / 2.0, boxes2[0] + boxes2[2] / 2.0)
my = torch.min(boxes1[1] - boxes1[3] / 2.0, boxes2[1] - boxes2[3] / 2.0)
My = torch.max(boxes1[1] + boxes1[3] / 2.0, boxes2[1] + boxes2[3] / 2.0)
w1 = boxes1[2]
h1 = boxes1[3]
w2 = boxes2[2]
h2 = boxes2[3]
uw = Mx - mx
uh = My - my
cw = w1 + w2 - uw
ch = h1 + h2 - uh
mask = ((cw <= 0) + (ch <= 0) > 0)
area1 = w1 * h1
area2 = w2 * h2
carea = cw * ch
carea[mask] = 0
uarea = area1 + area2 - carea
return carea / uarea
def get_region_boxes(boxes_and_confs):
# print('Getting boxes from boxes and confs ...')
boxes_list = []
confs_list = []
for item in boxes_and_confs:
boxes_list.append(item[0])
confs_list.append(item[1])
# boxes: [batch, num1 + num2 + num3, 1, 4]
# confs: [batch, num1 + num2 + num3, num_classes]
boxes = torch.cat(boxes_list, dim=1)
confs = torch.cat(confs_list, dim=1)
return [boxes, confs]
def convert2cpu(gpu_matrix):
return torch.FloatTensor(gpu_matrix.size()).copy_(gpu_matrix)
def convert2cpu_long(gpu_matrix):
return torch.LongTensor(gpu_matrix.size()).copy_(gpu_matrix)
def do_detect(model, img, conf_thresh, nms_thresh, use_cuda=1):
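# Runs a forward pass on a single cv2 image (HxWx3) or a pre-batched NxHxWx3
# array, normalising to [0,1] and moving channels first, then defers
# thresholding and NMS to utils.post_processing.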
model.eval()
if type(img) == np.ndarray and len(img.shape) == 3: # cv2 image
img = torch.from_numpy(img.transpose(2, 0, 1)).float().div(255.0).unsqueeze(0)
elif type(img) == np.ndarray and len(img.shape) == 4:
img = torch.from_numpy(img.transpose(0, 3, 1, 2)).float().div(255.0)
else:
print("unknow image type")
exit(-1)
if use_cuda:
img = img.cuda()
img = torch.autograd.Variable(img)
output = model(img)
return utils.post_processing(img, conf_thresh, nms_thresh, output)
# Object detection reference training scripts
This folder contains reference training scripts for object detection.
They serve as a log of how to train specific models, to provide baseline
training and evaluation scripts to quickly bootstrap research.
To execute the example commands below you must install the following:
```
cython
pycocotools
matplotlib
```
You must modify the following flags:
`--data-path=/path/to/coco/dataset`
`--nproc_per_node=<number_of_gpus_available>`
Unless otherwise noted, all models have been trained on 8x V100 GPUs.
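For example (the data path below is a placeholder, not a value prescribed by the reference scripts), a Faster R-CNN run with both flags filled in might look like:
```
python -m torch.distributed.launch --nproc_per_node=8 --use_env train.py\
    --data-path=/path/to/coco/dataset\
    --dataset coco --model fasterrcnn_resnet50_fpn --epochs 26\
    --lr-steps 16 22 --aspect-ratio-group-factor 3
```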
### Faster R-CNN
```
python -m torch.distributed.launch --nproc_per_node=8 --use_env train.py\
--dataset coco --model fasterrcnn_resnet50_fpn --epochs 26\
--lr-steps 16 22 --aspect-ratio-group-factor 3
```
### Mask R-CNN
```
python -m torch.distributed.launch --nproc_per_node=8 --use_env train.py\
--dataset coco --model maskrcnn_resnet50_fpn --epochs 26\
--lr-steps 16 22 --aspect-ratio-group-factor 3
```
### Keypoint R-CNN
```
python -m torch.distributed.launch --nproc_per_node=8 --use_env train.py\
--dataset coco_kp --model keypointrcnn_resnet50_fpn --epochs 46\
--lr-steps 36 43 --aspect-ratio-group-factor 3
```
import json
import tempfile
import numpy as np
import copy
import time
import torch
import torch._six
from pycocotools.cocoeval import COCOeval
from pycocotools.coco import COCO
import pycocotools.mask as mask_util
from collections import defaultdict
from . import utils
class CocoEvaluator(object):
def __init__(self, coco_gt, iou_types, bbox_fmt='coco'):
assert isinstance(iou_types, (list, tuple))
coco_gt = copy.deepcopy(coco_gt)
self.coco_gt = coco_gt
self.bbox_fmt = bbox_fmt.lower()
assert self.bbox_fmt in ['voc', 'coco', 'yolo']
self.iou_types = iou_types
self.coco_eval = {}
for iou_type in iou_types:
self.coco_eval[iou_type] = COCOeval(coco_gt, iouType=iou_type)
self.img_ids = []
self.eval_imgs = {k: [] for k in iou_types}
def update(self, predictions):
img_ids = list(np.unique(list(predictions.keys())))
self.img_ids.extend(img_ids)
for iou_type in self.iou_types:
results = self.prepare(predictions, iou_type)
coco_dt = loadRes(self.coco_gt, results) if results else COCO()
coco_eval = self.coco_eval[iou_type]
coco_eval.cocoDt = coco_dt
coco_eval.params.imgIds = list(img_ids)
img_ids, eval_imgs = evaluate(coco_eval)
self.eval_imgs[iou_type].append(eval_imgs)
def synchronize_between_processes(self):
for iou_type in self.iou_types:
self.eval_imgs[iou_type] = np.concatenate(self.eval_imgs[iou_type], 2)
create_common_coco_eval(self.coco_eval[iou_type], self.img_ids, self.eval_imgs[iou_type])
def accumulate(self):
for coco_eval in self.coco_eval.values():
coco_eval.accumulate()
def summarize(self):
for iou_type, coco_eval in self.coco_eval.items():
print("IoU metric: {}".format(iou_type))
coco_eval.summarize()
def prepare(self, predictions, iou_type):
if iou_type == "bbox":
return self.prepare_for_coco_detection(predictions)
elif iou_type == "segm":
return self.prepare_for_coco_segmentation(predictions)
elif iou_type == "keypoints":
return self.prepare_for_coco_keypoint(predictions)
else:
raise ValueError("Unknown iou type {}".format(iou_type))
def prepare_for_coco_detection(self, predictions):
coco_results = []
for original_id, prediction in predictions.items():
if len(prediction) == 0:
continue
if self.bbox_fmt == 'coco':
boxes = prediction["boxes"].tolist()
else:
boxes = prediction["boxes"]
boxes = convert_to_xywh(boxes, fmt=self.bbox_fmt).tolist()
scores = prediction["scores"].tolist()
labels = prediction["labels"].tolist()
coco_results.extend(
[
{
"image_id": original_id,
"category_id": labels[k],
"bbox": box,
"score": scores[k],
}
for k, box in enumerate(boxes)
]
)
return coco_results
def prepare_for_coco_segmentation(self, predictions):
coco_results = []
for original_id, prediction in predictions.items():
if len(prediction) == 0:
continue
scores = prediction["scores"]
labels = prediction["labels"]
masks = prediction["masks"]
masks = masks > 0.5
scores = prediction["scores"].tolist()
labels = prediction["labels"].tolist()
rles = [
mask_util.encode(np.array(mask[0, :, :, np.newaxis], dtype=np.uint8, order="F"))[0]
for mask in masks
]
for rle in rles:
rle["counts"] = rle["counts"].decode("utf-8")
coco_results.extend(
[
{
"image_id": original_id,
"category_id": labels[k],
"segmentation": rle,
"score": scores[k],
}
for k, rle in enumerate(rles)
]
)
return coco_results
def prepare_for_coco_keypoint(self, predictions):
coco_results = []
for original_id, prediction in predictions.items():
if len(prediction) == 0:
continue
# boxes = prediction["boxes"]
# boxes = convert_to_xywh(boxes).tolist()
scores = prediction["scores"].tolist()
labels = prediction["labels"].tolist()
keypoints = prediction["keypoints"]
keypoints = keypoints.flatten(start_dim=1).tolist()
coco_results.extend(
[
{
"image_id": original_id,
"category_id": labels[k],
'keypoints': keypoint,
"score": scores[k],
}
for k, keypoint in enumerate(keypoints)
]
)
return coco_results
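# Typical usage sketch (assuming a COCO ground-truth object `coco_gt` and a
# dict mapping image_id -> prediction dict with "boxes", "scores", "labels"):
#   evaluator = CocoEvaluator(coco_gt, iou_types=["bbox"])
#   evaluator.update(predictions)
#   evaluator.synchronize_between_processes()
#   evaluator.accumulate()
#   evaluator.summarize()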
def convert_to_xywh(boxes, fmt='voc'):
if fmt.lower() == 'voc':
xmin, ymin, xmax, ymax = boxes.unbind(1)
return torch.stack((xmin, ymin, xmax - xmin, ymax - ymin), dim=1)
elif fmt.lower() == 'yolo':
xcen, ycen, w, h = boxes.unbind(1)
return torch.stack((xcen-w/2, ycen-h/2, w, h), dim=1)
def merge(img_ids, eval_imgs):
all_img_ids = utils.all_gather(img_ids)
all_eval_imgs = utils.all_gather(eval_imgs)
merged_img_ids = []
for p in all_img_ids:
merged_img_ids.extend(p)
merged_eval_imgs = []
for p in all_eval_imgs:
merged_eval_imgs.append(p)
merged_img_ids = np.array(merged_img_ids)
merged_eval_imgs = np.concatenate(merged_eval_imgs, 2)
# keep only unique (and in sorted order) images
merged_img_ids, idx = np.unique(merged_img_ids, return_index=True)
merged_eval_imgs = merged_eval_imgs[..., idx]
return merged_img_ids, merged_eval_imgs
def create_common_coco_eval(coco_eval, img_ids, eval_imgs):
img_ids, eval_imgs = merge(img_ids, eval_imgs)
img_ids = list(img_ids)
eval_imgs = list(eval_imgs.flatten())
coco_eval.evalImgs = eval_imgs
coco_eval.params.imgIds = img_ids
coco_eval._paramsEval = copy.deepcopy(coco_eval.params)
#################################################################
# From pycocotools, just removed the prints and fixed
# a Python3 bug about unicode not defined
#################################################################
# Ideally, pycocotools wouldn't have hard-coded prints
# so that we could avoid copy-pasting those two functions
def createIndex(self):
# create index
# print('creating index...')
anns, cats, imgs = {}, {}, {}
imgToAnns, catToImgs = defaultdict(list), defaultdict(list)
if 'annotations' in self.dataset:
for ann in self.dataset['annotations']:
imgToAnns[ann['image_id']].append(ann)
anns[ann['id']] = ann
if 'images' in self.dataset:
for img in self.dataset['images']:
imgs[img['id']] = img
if 'categories' in self.dataset:
for cat in self.dataset['categories']:
cats[cat['id']] = cat
if 'annotations' in self.dataset and 'categories' in self.dataset:
for ann in self.dataset['annotations']:
catToImgs[ann['category_id']].append(ann['image_id'])
# print('index created!')
# create class members
self.anns = anns
self.imgToAnns = imgToAnns
self.catToImgs = catToImgs
self.imgs = imgs
self.cats = cats
maskUtils = mask_util
def loadRes(self, resFile):
"""
Load result file and return a result api object.
:param resFile (str) : file name of result file
:return: res (obj) : result api object
"""
res = COCO()
res.dataset['images'] = [img for img in self.dataset['images']]
# print('Loading and preparing results...')
# tic = time.time()
if isinstance(resFile, torch._six.string_classes):
anns = json.load(open(resFile))
elif type(resFile) == np.ndarray:
anns = self.loadNumpyAnnotations(resFile)
else:
anns = resFile
assert type(anns) == list, 'results is not an array of objects'
annsImgIds = [ann['image_id'] for ann in anns]
assert set(annsImgIds) == (set(annsImgIds) & set(self.getImgIds())), \
'Results do not correspond to current coco set'
if 'caption' in anns[0]:
imgIds = set([img['id'] for img in res.dataset['images']]) & set([ann['image_id'] for ann in anns])
res.dataset['images'] = [img for img in res.dataset['images'] if img['id'] in imgIds]
for id, ann in enumerate(anns):
ann['id'] = id + 1
elif 'bbox' in anns[0] and not anns[0]['bbox'] == []:
res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
for id, ann in enumerate(anns):
ann['bbox'] = ann['bbox'][0]
bb = ann['bbox']
x1, x2, y1, y2 = [bb[0], bb[0] + bb[2], bb[1], bb[1] + bb[3]]
if 'segmentation' not in ann:
ann['segmentation'] = [[x1, y1, x1, y2, x2, y2, x2, y1]]
ann['area'] = bb[2] * bb[3]
ann['id'] = id + 1
ann['iscrowd'] = 0
elif 'segmentation' in anns[0]:
res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
for id, ann in enumerate(anns):
# now only support compressed RLE format as segmentation results
ann['area'] = maskUtils.area(ann['segmentation'])
if 'bbox' not in ann:
ann['bbox'] = maskUtils.toBbox(ann['segmentation'])
ann['id'] = id + 1
ann['iscrowd'] = 0
elif 'keypoints' in anns[0]:
res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
for id, ann in enumerate(anns):
s = ann['keypoints']
x = s[0::3]
y = s[1::3]
x1, x2, y1, y2 = np.min(x), np.max(x), np.min(y), np.max(y)
ann['area'] = (x2 - x1) * (y2 - y1)
ann['id'] = id + 1
ann['bbox'] = [x1, y1, x2 - x1, y2 - y1]
# print('DONE (t={:0.2f}s)'.format(time.time()- tic))
res.dataset['annotations'] = anns
createIndex(res)
return res
def evaluate(self):
'''
Run per image evaluation on given images and store results (a list of dict) in self.evalImgs
:return: None
'''
# tic = time.time()
# print('Running per image evaluation...')
p = self.params
# add backward compatibility if useSegm is specified in params
if p.useSegm is not None:
p.iouType = 'segm' if p.useSegm == 1 else 'bbox'
print('useSegm (deprecated) is not None. Running {} evaluation'.format(p.iouType))
# print('Evaluate annotation type *{}*'.format(p.iouType))
p.imgIds = list(np.unique(p.imgIds))
if p.useCats:
p.catIds = list(np.unique(p.catIds))
p.maxDets = sorted(p.maxDets)
self.params = p
self._prepare()
# loop through images, area range, max detection number
catIds = p.catIds if p.useCats else [-1]
if p.iouType == 'segm' or p.iouType == 'bbox':
computeIoU = self.computeIoU
elif p.iouType == 'keypoints':
computeIoU = self.computeOks
self.ious = {
(imgId, catId): computeIoU(imgId, catId)
for imgId in p.imgIds
for catId in catIds}
evaluateImg = self.evaluateImg
maxDet = p.maxDets[-1]
evalImgs = [
evaluateImg(imgId, catId, areaRng, maxDet)
for catId in catIds
for areaRng in p.areaRng
for imgId in p.imgIds
]
# this is NOT in the pycocotools code, but could be done outside
evalImgs = np.asarray(evalImgs).reshape(len(catIds), len(p.areaRng), len(p.imgIds))
self._paramsEval = copy.deepcopy(self.params)
# toc = time.time()
# print('DONE (t={:0.2f}s).'.format(toc-tic))
return p.imgIds, evalImgs
#################################################################
# end of straight copy from pycocotools, just removing the prints
#################################################################
import copy
import os
from PIL import Image
import torch
import torch.utils.data
import torchvision
from pycocotools import mask as coco_mask
from pycocotools.coco import COCO
from . import transforms as T
class FilterAndRemapCocoCategories(object):
def __init__(self, categories, remap=True):
self.categories = categories
self.remap = remap
def __call__(self, image, target):
anno = target["annotations"]
anno = [obj for obj in anno if obj["category_id"] in self.categories]
if not self.remap:
target["annotations"] = anno
return image, target
anno = copy.deepcopy(anno)
for obj in anno:
obj["category_id"] = self.categories.index(obj["category_id"])
target["annotations"] = anno
return image, target
def convert_coco_poly_to_mask(segmentations, height, width):
masks = []
for polygons in segmentations:
rles = coco_mask.frPyObjects(polygons, height, width)
mask = coco_mask.decode(rles)
if len(mask.shape) < 3:
mask = mask[..., None]
mask = torch.as_tensor(mask, dtype=torch.uint8)
mask = mask.any(dim=2)
masks.append(mask)
if masks:
masks = torch.stack(masks, dim=0)
else:
masks = torch.zeros((0, height, width), dtype=torch.uint8)
return masks
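# Illustrative sketch (added, not from the original file): decode a single COCO-style
# polygon with the helper above; the square polygon and the 8x8 canvas are made up.
def _demo_convert_coco_poly_to_mask():
    # one object whose segmentation is a single polygon (a square from (1, 1) to (5, 5))
    segmentations = [[[1.0, 1.0, 5.0, 1.0, 5.0, 5.0, 1.0, 5.0]]]
    masks = convert_coco_poly_to_mask(segmentations, height=8, width=8)
    # masks is a uint8 tensor of shape (1, 8, 8) with ones inside the rasterized square
    return masks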
class ConvertCocoPolysToMask(object):
def __call__(self, image, target):
w, h = image.size
image_id = target["image_id"]
image_id = torch.tensor([image_id])
anno = target["annotations"]
anno = [obj for obj in anno if obj['iscrowd'] == 0]
boxes = [obj["bbox"] for obj in anno]
# guard against no boxes via resizing
boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
boxes[:, 2:] += boxes[:, :2]
boxes[:, 0::2].clamp_(min=0, max=w)
boxes[:, 1::2].clamp_(min=0, max=h)
classes = [obj["category_id"] for obj in anno]
classes = torch.tensor(classes, dtype=torch.int64)
segmentations = [obj["segmentation"] for obj in anno]
masks = convert_coco_poly_to_mask(segmentations, h, w)
keypoints = None
if anno and "keypoints" in anno[0]:
keypoints = [obj["keypoints"] for obj in anno]
keypoints = torch.as_tensor(keypoints, dtype=torch.float32)
num_keypoints = keypoints.shape[0]
if num_keypoints:
keypoints = keypoints.view(num_keypoints, -1, 3)
keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
boxes = boxes[keep]
classes = classes[keep]
masks = masks[keep]
if keypoints is not None:
keypoints = keypoints[keep]
target = {}
target["boxes"] = boxes
target["labels"] = classes
target["masks"] = masks
target["image_id"] = image_id
if keypoints is not None:
target["keypoints"] = keypoints
# for conversion to coco api
area = torch.tensor([obj["area"] for obj in anno])
iscrowd = torch.tensor([obj["iscrowd"] for obj in anno])
target["area"] = area
target["iscrowd"] = iscrowd
return image, target
def _coco_remove_images_without_annotations(dataset, cat_list=None):
def _has_only_empty_bbox(anno):
return all(any(o <= 1 for o in obj["bbox"][2:]) for obj in anno)
def _count_visible_keypoints(anno):
return sum(sum(1 for v in ann["keypoints"][2::3] if v > 0) for ann in anno)
min_keypoints_per_image = 10
def _has_valid_annotation(anno):
# if it's empty, there is no annotation
if len(anno) == 0:
return False
# if all boxes have close to zero area, there is no annotation
if _has_only_empty_bbox(anno):
return False
# the keypoints task has a slightly different criterion for considering
# whether an annotation is valid
if "keypoints" not in anno[0]:
return True
# for keypoint detection tasks, only consider valid images those
# containing at least min_keypoints_per_image
if _count_visible_keypoints(anno) >= min_keypoints_per_image:
return True
return False
assert isinstance(dataset, torchvision.datasets.CocoDetection)
ids = []
for ds_idx, img_id in enumerate(dataset.ids):
ann_ids = dataset.coco.getAnnIds(imgIds=img_id, iscrowd=None)
anno = dataset.coco.loadAnns(ann_ids)
if cat_list:
anno = [obj for obj in anno if obj["category_id"] in cat_list]
if _has_valid_annotation(anno):
ids.append(ds_idx)
dataset = torch.utils.data.Subset(dataset, ids)
return dataset
def convert_to_coco_api(ds, bbox_fmt='voc'):
"""
"""
print("in function convert_to_coco_api...")
coco_ds = COCO()
# annotation IDs need to start at 1, not 0, see torchvision issue #1530
ann_id = 1
dataset = {'images': [], 'categories': [], 'annotations': []}
categories = set()
for img_idx in range(len(ds)):
# find better way to get target
# targets = ds.get_annotations(img_idx)
img, targets = ds[img_idx]
image_id = targets["image_id"].item()
img_dict = {}
img_dict['id'] = image_id
img_dict['height'] = img.shape[-2]
img_dict['width'] = img.shape[-1]
dataset['images'].append(img_dict)
bboxes = targets["boxes"]
# to coco format: xmin, ymin, w, h
if bbox_fmt.lower() == "voc": # xmin, ymin, xmax, ymax
bboxes[:, 2:] -= bboxes[:, :2]
elif bbox_fmt.lower() == "yolo": # xcen, ycen, w, h
bboxes[:, :2] = bboxes[:, :2] - bboxes[:, 2:]/2
elif bbox_fmt.lower() == "coco":
pass
else:
raise ValueError(f"bounding box format {bbox_fmt} not supported!")
bboxes = bboxes.tolist()
labels = targets['labels'].tolist()
areas = targets['area'].tolist()
iscrowd = targets['iscrowd'].tolist()
if 'masks' in targets:
masks = targets['masks']
# make masks Fortran contiguous for coco_mask
masks = masks.permute(0, 2, 1).contiguous().permute(0, 2, 1)
if 'keypoints' in targets:
keypoints = targets['keypoints']
keypoints = keypoints.reshape(keypoints.shape[0], -1).tolist()
num_objs = len(bboxes)
for i in range(num_objs):
ann = {}
ann['image_id'] = image_id
ann['bbox'] = bboxes[i]
ann['category_id'] = labels[i]
categories.add(labels[i])
ann['area'] = areas[i]
ann['iscrowd'] = iscrowd[i]
ann['id'] = ann_id
if 'masks' in targets:
ann["segmentation"] = coco_mask.encode(masks[i].numpy())
if 'keypoints' in targets:
ann['keypoints'] = keypoints[i]
ann['num_keypoints'] = sum(k != 0 for k in keypoints[i][2::3])
dataset['annotations'].append(ann)
ann_id += 1
dataset['categories'] = [{'id': i} for i in sorted(categories)]
coco_ds.dataset = dataset
coco_ds.createIndex()
return coco_ds
def get_coco_api_from_dataset(dataset):
for _ in range(10):
if isinstance(dataset, torchvision.datasets.CocoDetection):
break
if isinstance(dataset, torch.utils.data.Subset):
dataset = dataset.dataset
if isinstance(dataset, torchvision.datasets.CocoDetection):
return dataset.coco
return convert_to_coco_api(dataset)
class CocoDetection(torchvision.datasets.CocoDetection):
def __init__(self, img_folder, ann_file, transforms):
super(CocoDetection, self).__init__(img_folder, ann_file)
self._transforms = transforms
def __getitem__(self, idx):
img, target = super(CocoDetection, self).__getitem__(idx)
image_id = self.ids[idx]
target = dict(image_id=image_id, annotations=target)
if self._transforms is not None:
img, target = self._transforms(img, target)
return img, target
def get_coco(root, image_set, transforms, mode='instances'):
anno_file_template = "{}_{}2017.json"
PATHS = {
"train": ("train2017", os.path.join("annotations", anno_file_template.format(mode, "train"))),
"val": ("val2017", os.path.join("annotations", anno_file_template.format(mode, "val"))),
# "train": ("val2017", os.path.join("annotations", anno_file_template.format(mode, "val")))
}
t = [ConvertCocoPolysToMask()]
if transforms is not None:
t.append(transforms)
transforms = T.Compose(t)
img_folder, ann_file = PATHS[image_set]
img_folder = os.path.join(root, img_folder)
ann_file = os.path.join(root, ann_file)
dataset = CocoDetection(img_folder, ann_file, transforms=transforms)
if image_set == "train":
dataset = _coco_remove_images_without_annotations(dataset)
# dataset = torch.utils.data.Subset(dataset, [i for i in range(500)])
return dataset
def get_coco_kp(root, image_set, transforms):
return get_coco(root, image_set, transforms, mode="person_keypoints")
import math
import sys
import time
import torch
import torchvision.models.detection.mask_rcnn
from .coco_utils import get_coco_api_from_dataset
from .coco_eval import CocoEvaluator
from . import utils
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq):
model.train()
metric_logger = utils.MetricLogger(delimiter=" ")
metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
header = 'Epoch: [{}]'.format(epoch)
lr_scheduler = None
if epoch == 0:
warmup_factor = 1. / 1000
warmup_iters = min(1000, len(data_loader) - 1)
lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)
for images, targets in metric_logger.log_every(data_loader, print_freq, header):
images = list(image.to(device) for image in images)
targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
loss_dict = model(images, targets)
losses = sum(loss for loss in loss_dict.values())
# reduce losses over all GPUs for logging purposes
loss_dict_reduced = utils.reduce_dict(loss_dict)
losses_reduced = sum(loss for loss in loss_dict_reduced.values())
loss_value = losses_reduced.item()
if not math.isfinite(loss_value):
print("Loss is {}, stopping training".format(loss_value))
print(loss_dict_reduced)
sys.exit(1)
optimizer.zero_grad()
losses.backward()
optimizer.step()
if lr_scheduler is not None:
lr_scheduler.step()
metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
metric_logger.update(lr=optimizer.param_groups[0]["lr"])
return metric_logger
def _get_iou_types(model):
model_without_ddp = model
if isinstance(model, torch.nn.parallel.DistributedDataParallel):
model_without_ddp = model.module
iou_types = ["bbox"]
if isinstance(model_without_ddp, torchvision.models.detection.MaskRCNN):
iou_types.append("segm")
if isinstance(model_without_ddp, torchvision.models.detection.KeypointRCNN):
iou_types.append("keypoints")
return iou_types
@torch.no_grad()
def evaluate(model, data_loader, device):
n_threads = torch.get_num_threads()
# FIXME remove this and make paste_masks_in_image run on the GPU
torch.set_num_threads(1)
cpu_device = torch.device("cpu")
model.eval()
metric_logger = utils.MetricLogger(delimiter=" ")
header = 'Test:'
coco = get_coco_api_from_dataset(data_loader.dataset)
iou_types = _get_iou_types(model)
coco_evaluator = CocoEvaluator(coco, iou_types)
for images, targets in metric_logger.log_every(data_loader, 100, header):
images = list(img.to(device) for img in images)
targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
torch.cuda.synchronize()
model_time = time.time()
outputs = model(images)
outputs = [{k: v.to(cpu_device) for k, v in t.items()} for t in outputs]
model_time = time.time() - model_time
res = {target["image_id"].item(): output for target, output in zip(targets, outputs)}
evaluator_time = time.time()
coco_evaluator.update(res)
evaluator_time = time.time() - evaluator_time
metric_logger.update(model_time=model_time, evaluator_time=evaluator_time)
# gather the stats from all processes
metric_logger.synchronize_between_processes()
print("Averaged stats:", metric_logger)
coco_evaluator.synchronize_between_processes()
# accumulate predictions from all images
coco_evaluator.accumulate()
coco_evaluator.summarize()
torch.set_num_threads(n_threads)
return coco_evaluator
import bisect
from collections import defaultdict
import copy
from itertools import repeat, chain
import math
import numpy as np
import torch
import torch.utils.data
from torch.utils.data.sampler import BatchSampler, Sampler
from torch.utils.model_zoo import tqdm
import torchvision
from PIL import Image
def _repeat_to_at_least(iterable, n):
repeat_times = math.ceil(n / len(iterable))
repeated = chain.from_iterable(repeat(iterable, repeat_times))
return list(repeated)
class GroupedBatchSampler(BatchSampler):
"""
Wraps another sampler to yield a mini-batch of indices.
It enforces that the batch only contain elements from the same group.
It also tries to provide mini-batches that follow an ordering as close as
possible to the ordering of the original sampler.
Arguments:
sampler (Sampler): Base sampler.
group_ids (list[int]): If the sampler produces indices in range [0, N),
`group_ids` must be a list of `N` ints which contains the group id of each sample.
The group ids must be a continuous set of integers starting from
0, i.e. they must be in the range [0, num_groups).
batch_size (int): Size of mini-batch.
"""
def __init__(self, sampler, group_ids, batch_size):
if not isinstance(sampler, Sampler):
raise ValueError(
"sampler should be an instance of "
"torch.utils.data.Sampler, but got sampler={}".format(sampler)
)
self.sampler = sampler
self.group_ids = group_ids
self.batch_size = batch_size
def __iter__(self):
buffer_per_group = defaultdict(list)
samples_per_group = defaultdict(list)
num_batches = 0
for idx in self.sampler:
group_id = self.group_ids[idx]
buffer_per_group[group_id].append(idx)
samples_per_group[group_id].append(idx)
if len(buffer_per_group[group_id]) == self.batch_size:
yield buffer_per_group[group_id]
num_batches += 1
del buffer_per_group[group_id]
assert len(buffer_per_group[group_id]) < self.batch_size
# now we have run out of elements that satisfy
# the group criteria, let's return the remaining
# elements so that the size of the sampler is
# deterministic
expected_num_batches = len(self)
num_remaining = expected_num_batches - num_batches
if num_remaining > 0:
# for the remaining batches, take first the buffers with largest number
# of elements
for group_id, _ in sorted(buffer_per_group.items(),
key=lambda x: len(x[1]), reverse=True):
remaining = self.batch_size - len(buffer_per_group[group_id])
samples_from_group_id = _repeat_to_at_least(samples_per_group[group_id], remaining)
buffer_per_group[group_id].extend(samples_from_group_id[:remaining])
assert len(buffer_per_group[group_id]) == self.batch_size
yield buffer_per_group[group_id]
num_remaining -= 1
if num_remaining == 0:
break
assert num_remaining == 0
def __len__(self):
return len(self.sampler) // self.batch_size
def _compute_aspect_ratios_slow(dataset, indices=None):
print("Your dataset doesn't support the fast path for "
"computing the aspect ratios, so will iterate over "
"the full dataset and load every image instead. "
"This might take some time...")
if indices is None:
indices = range(len(dataset))
class SubsetSampler(Sampler):
def __init__(self, indices):
self.indices = indices
def __iter__(self):
return iter(self.indices)
def __len__(self):
return len(self.indices)
sampler = SubsetSampler(indices)
data_loader = torch.utils.data.DataLoader(
dataset, batch_size=1, sampler=sampler,
num_workers=14, # you might want to increase it for faster processing
collate_fn=lambda x: x[0])
aspect_ratios = []
with tqdm(total=len(dataset)) as pbar:
for _i, (img, _) in enumerate(data_loader):
pbar.update(1)
height, width = img.shape[-2:]
aspect_ratio = float(width) / float(height)
aspect_ratios.append(aspect_ratio)
return aspect_ratios
def _compute_aspect_ratios_custom_dataset(dataset, indices=None):
if indices is None:
indices = range(len(dataset))
aspect_ratios = []
for i in indices:
height, width = dataset.get_height_and_width(i)
aspect_ratio = float(width) / float(height)
aspect_ratios.append(aspect_ratio)
return aspect_ratios
def _compute_aspect_ratios_coco_dataset(dataset, indices=None):
if indices is None:
indices = range(len(dataset))
aspect_ratios = []
for i in indices:
img_info = dataset.coco.imgs[dataset.ids[i]]
aspect_ratio = float(img_info["width"]) / float(img_info["height"])
aspect_ratios.append(aspect_ratio)
return aspect_ratios
def _compute_aspect_ratios_voc_dataset(dataset, indices=None):
if indices is None:
indices = range(len(dataset))
aspect_ratios = []
for i in indices:
# this doesn't load the data into memory, because PIL loads it lazily
width, height = Image.open(dataset.images[i]).size
aspect_ratio = float(width) / float(height)
aspect_ratios.append(aspect_ratio)
return aspect_ratios
def _compute_aspect_ratios_subset_dataset(dataset, indices=None):
if indices is None:
indices = range(len(dataset))
ds_indices = [dataset.indices[i] for i in indices]
return compute_aspect_ratios(dataset.dataset, ds_indices)
def compute_aspect_ratios(dataset, indices=None):
if hasattr(dataset, "get_height_and_width"):
return _compute_aspect_ratios_custom_dataset(dataset, indices)
if isinstance(dataset, torchvision.datasets.CocoDetection):
return _compute_aspect_ratios_coco_dataset(dataset, indices)
if isinstance(dataset, torchvision.datasets.VOCDetection):
return _compute_aspect_ratios_voc_dataset(dataset, indices)
if isinstance(dataset, torch.utils.data.Subset):
return _compute_aspect_ratios_subset_dataset(dataset, indices)
# slow path
return _compute_aspect_ratios_slow(dataset, indices)
def _quantize(x, bins):
bins = copy.deepcopy(bins)
bins = sorted(bins)
quantized = list(map(lambda y: bisect.bisect_right(bins, y), x))
return quantized
def create_aspect_ratio_groups(dataset, k=0):
aspect_ratios = compute_aspect_ratios(dataset)
bins = (2 ** np.linspace(-1, 1, 2 * k + 1)).tolist() if k > 0 else [1.0]
groups = _quantize(aspect_ratios, bins)
# count number of elements per group
counts = np.unique(groups, return_counts=True)[1]
fbins = [0] + bins + [np.inf]
print("Using {} as bins for aspect ratio quantization".format(fbins))
print("Count of instances per bin: {}".format(counts))
return groups
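# Usage sketch (added; `dataset` is any detection dataset supported by
# compute_aspect_ratios, not a name defined in this repo): wire the grouping helpers
# above into a DataLoader so every mini-batch only mixes images of similar aspect ratio.
def _demo_grouped_loader(dataset, batch_size=2, k=3):
    group_ids = create_aspect_ratio_groups(dataset, k=k)
    sampler = torch.utils.data.RandomSampler(dataset)
    batch_sampler = GroupedBatchSampler(sampler, group_ids, batch_size)
    return torch.utils.data.DataLoader(
        dataset, batch_sampler=batch_sampler,
        collate_fn=lambda batch: tuple(zip(*batch)))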
r"""PyTorch Detection Training.
To run in a multi-gpu environment, use the distributed launcher::
python -m torch.distributed.launch --nproc_per_node=$NGPU --use_env \
train.py ... --world-size $NGPU
The default hyperparameters are tuned for training on 8 GPUs with 2 images per GPU:
--lr 0.02 --batch-size 2 --world-size 8
If you use a different number of GPUs, scale the learning rate to 0.02/8*$NGPU.
On top of that, for training Faster/Mask R-CNN, the default hyperparameters are
--epochs 26 --lr-steps 16 22 --aspect-ratio-group-factor 3
Also, if you train Keypoint R-CNN, the default hyperparameters are
--epochs 46 --lr-steps 36 43 --aspect-ratio-group-factor 3
Because the number of images is smaller in the person keypoint subset of COCO,
the number of epochs should be adapted so that we have the same number of iterations.
"""
import datetime
import os
import time
import torch
import torch.utils.data
from torch import nn
import torchvision
import torchvision.models.detection
import torchvision.models.detection.mask_rcnn
from .coco_utils import get_coco, get_coco_kp
from .group_by_aspect_ratio import GroupedBatchSampler, create_aspect_ratio_groups
from .engine import train_one_epoch, evaluate
from . import utils
from . import transforms as T
def get_dataset(name, image_set, transform, data_path):
paths = {
"coco": (data_path, get_coco, 91),
"coco_kp": (data_path, get_coco_kp, 2)
}
p, ds_fn, num_classes = paths[name]
ds = ds_fn(p, image_set=image_set, transforms=transform)
return ds, num_classes
def get_transform(train):
transforms = []
transforms.append(T.ToTensor())
if train:
transforms.append(T.RandomHorizontalFlip(0.5))
return T.Compose(transforms)
def main(args):
utils.init_distributed_mode(args)
print(args)
device = torch.device(args.device)
# Data loading code
print("Loading data")
dataset, num_classes = get_dataset(args.dataset, "train", get_transform(train=True), args.data_path)
dataset_test, _ = get_dataset(args.dataset, "val", get_transform(train=False), args.data_path)
print("Creating data loaders")
if args.distributed:
train_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
test_sampler = torch.utils.data.distributed.DistributedSampler(dataset_test)
else:
train_sampler = torch.utils.data.RandomSampler(dataset)
test_sampler = torch.utils.data.SequentialSampler(dataset_test)
if args.aspect_ratio_group_factor >= 0:
group_ids = create_aspect_ratio_groups(dataset, k=args.aspect_ratio_group_factor)
train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids, args.batch_size)
else:
train_batch_sampler = torch.utils.data.BatchSampler(
train_sampler, args.batch_size, drop_last=True)
data_loader = torch.utils.data.DataLoader(
dataset, batch_sampler=train_batch_sampler, num_workers=args.workers,
collate_fn=utils.collate_fn)
data_loader_test = torch.utils.data.DataLoader(
dataset_test, batch_size=1,
sampler=test_sampler, num_workers=args.workers,
collate_fn=utils.collate_fn)
print("Creating model")
model = torchvision.models.detection.__dict__[args.model](num_classes=num_classes,
pretrained=args.pretrained)
model.to(device)
model_without_ddp = model
if args.distributed:
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
model_without_ddp = model.module
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(
params, lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay)
# lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.lr_step_size, gamma=args.lr_gamma)
lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=args.lr_steps, gamma=args.lr_gamma)
if args.resume:
checkpoint = torch.load(args.resume, map_location='cpu')
model_without_ddp.load_state_dict(checkpoint['model'])
optimizer.load_state_dict(checkpoint['optimizer'])
lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
args.start_epoch = checkpoint['epoch'] + 1
if args.test_only:
evaluate(model, data_loader_test, device=device)
return
print("Start training")
start_time = time.time()
for epoch in range(args.start_epoch, args.epochs):
if args.distributed:
train_sampler.set_epoch(epoch)
train_one_epoch(model, optimizer, data_loader, device, epoch, args.print_freq)
lr_scheduler.step()
if args.output_dir:
utils.save_on_master({
'model': model_without_ddp.state_dict(),
'optimizer': optimizer.state_dict(),
'lr_scheduler': lr_scheduler.state_dict(),
'args': args,
'epoch': epoch},
os.path.join(args.output_dir, 'model_{}.pth'.format(epoch)))
# evaluate after every epoch
evaluate(model, data_loader_test, device=device)
total_time = time.time() - start_time
total_time_str = str(datetime.timedelta(seconds=int(total_time)))
print('Training time {}'.format(total_time_str))
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(
description=__doc__)
parser.add_argument('--data-path', default='/datasets01/COCO/022719/', help='dataset')
parser.add_argument('--dataset', default='coco', help='dataset')
parser.add_argument('--model', default='maskrcnn_resnet50_fpn', help='model')
parser.add_argument('--device', default='cuda', help='device')
parser.add_argument('-b', '--batch-size', default=2, type=int,
help='images per gpu, the total batch size is $NGPU x batch_size')
parser.add_argument('--epochs', default=26, type=int, metavar='N',
help='number of total epochs to run')
parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
help='number of data loading workers (default: 4)')
parser.add_argument('--lr', default=0.02, type=float,
help='initial learning rate, 0.02 is the default value for training '
'on 8 gpus and 2 images_per_gpu')
parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
help='momentum')
parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
metavar='W', help='weight decay (default: 1e-4)',
dest='weight_decay')
parser.add_argument('--lr-step-size', default=8, type=int, help='decrease lr every step-size epochs')
parser.add_argument('--lr-steps', default=[16, 22], nargs='+', type=int, help='decrease lr every step-size epochs')
parser.add_argument('--lr-gamma', default=0.1, type=float, help='decrease lr by a factor of lr-gamma')
parser.add_argument('--print-freq', default=20, type=int, help='print frequency')
parser.add_argument('--output-dir', default='.', help='path where to save')
parser.add_argument('--resume', default='', help='resume from checkpoint')
parser.add_argument('--start_epoch', default=0, type=int, help='start epoch')
parser.add_argument('--aspect-ratio-group-factor', default=3, type=int)
parser.add_argument(
"--test-only",
dest="test_only",
help="Only test the model",
action="store_true",
)
parser.add_argument(
"--pretrained",
dest="pretrained",
help="Use pre-trained models from the modelzoo",
action="store_true",
)
# distributed training parameters
parser.add_argument('--world-size', default=1, type=int,
help='number of distributed processes')
parser.add_argument('--dist-url', default='env://', help='url used to set up distributed training')
args = parser.parse_args()
if args.output_dir:
utils.mkdir(args.output_dir)
main(args)
import random
import torch
from torchvision.transforms import functional as F
def _flip_coco_person_keypoints(kps, width):
flip_inds = [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15]
flipped_data = kps[:, flip_inds]
flipped_data[..., 0] = width - flipped_data[..., 0]
# Maintain COCO convention that if visibility == 0, then x, y = 0
inds = flipped_data[..., 2] == 0
flipped_data[inds] = 0
return flipped_data
class Compose(object):
def __init__(self, transforms):
self.transforms = transforms
def __call__(self, image, target):
for t in self.transforms:
image, target = t(image, target)
return image, target
class RandomHorizontalFlip(object):
def __init__(self, prob):
self.prob = prob
def __call__(self, image, target):
if random.random() < self.prob:
height, width = image.shape[-2:]
image = image.flip(-1)
bbox = target["boxes"]
bbox[:, [0, 2]] = width - bbox[:, [2, 0]]
target["boxes"] = bbox
if "masks" in target:
target["masks"] = target["masks"].flip(-1)
if "keypoints" in target:
keypoints = target["keypoints"]
keypoints = _flip_coco_person_keypoints(keypoints, width)
target["keypoints"] = keypoints
return image, target
class ToTensor(object):
def __call__(self, image, target):
image = F.to_tensor(image)
return image, target
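# Usage sketch (added; the image and target are dummies): train.py composes the
# transforms above this way; here the flip probability is 1.0 so the effect is deterministic.
def _demo_transforms():
    from PIL import Image
    image = Image.new('RGB', (64, 48))
    target = {"boxes": torch.tensor([[10., 10., 30., 40.]]),
              "labels": torch.tensor([1])}
    tfm = Compose([ToTensor(), RandomHorizontalFlip(prob=1.0)])
    # after the flip the box becomes [[34., 10., 54., 40.]] (x coordinates mirrored)
    return tfm(image, target)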
from collections import defaultdict, deque
import datetime
import pickle
import time
import torch
import torch.distributed as dist
import errno
import os
class SmoothedValue(object):
"""Track a series of values and provide access to smoothed values over a
window or the global series average.
"""
def __init__(self, window_size=20, fmt=None):
if fmt is None:
fmt = "{median:.4f} ({global_avg:.4f})"
self.deque = deque(maxlen=window_size)
self.total = 0.0
self.count = 0
self.fmt = fmt
def update(self, value, n=1):
self.deque.append(value)
self.count += n
self.total += value * n
def synchronize_between_processes(self):
"""
Warning: does not synchronize the deque!
"""
if not is_dist_avail_and_initialized():
return
t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda')
dist.barrier()
dist.all_reduce(t)
t = t.tolist()
self.count = int(t[0])
self.total = t[1]
@property
def median(self):
d = torch.tensor(list(self.deque))
return d.median().item()
@property
def avg(self):
d = torch.tensor(list(self.deque), dtype=torch.float32)
return d.mean().item()
@property
def global_avg(self):
return self.total / self.count
@property
def max(self):
return max(self.deque)
@property
def value(self):
return self.deque[-1]
def __str__(self):
return self.fmt.format(
median=self.median,
avg=self.avg,
global_avg=self.global_avg,
max=self.max,
value=self.value)
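# Small usage sketch (added; the loss values are made up): SmoothedValue tracks both a
# windowed median/average and the global average of everything it has seen.
def _demo_smoothed_value():
    sv = SmoothedValue(window_size=3, fmt='{median:.2f} ({global_avg:.2f})')
    for loss in (1.0, 0.5, 0.25, 0.125):
        sv.update(loss)
    # median over the last 3 values is 0.25, global average over all 4 is ~0.47
    return str(sv)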
def all_gather(data):
"""
Run all_gather on arbitrary picklable data (not necessarily tensors)
Args:
data: any picklable object
Returns:
list[data]: list of data gathered from each rank
"""
world_size = get_world_size()
if world_size == 1:
return [data]
# serialized to a Tensor
buffer = pickle.dumps(data)
storage = torch.ByteStorage.from_buffer(buffer)
tensor = torch.ByteTensor(storage).to("cuda")
# obtain Tensor size of each rank
local_size = torch.tensor([tensor.numel()], device="cuda")
size_list = [torch.tensor([0], device="cuda") for _ in range(world_size)]
dist.all_gather(size_list, local_size)
size_list = [int(size.item()) for size in size_list]
max_size = max(size_list)
# receiving Tensor from all ranks
# we pad the tensor because torch all_gather does not support
# gathering tensors of different shapes
tensor_list = []
for _ in size_list:
tensor_list.append(torch.empty((max_size,), dtype=torch.uint8, device="cuda"))
if local_size != max_size:
padding = torch.empty(size=(max_size - local_size,), dtype=torch.uint8, device="cuda")
tensor = torch.cat((tensor, padding), dim=0)
dist.all_gather(tensor_list, tensor)
data_list = []
for size, tensor in zip(size_list, tensor_list):
buffer = tensor.cpu().numpy().tobytes()[:size]
data_list.append(pickle.loads(buffer))
return data_list
def reduce_dict(input_dict, average=True):
"""
Args:
input_dict (dict): all the values will be reduced
average (bool): whether to do average or sum
Reduce the values in the dictionary from all processes so that all processes
have the averaged results. Returns a dict with the same fields as
input_dict, after reduction.
"""
world_size = get_world_size()
if world_size < 2:
return input_dict
with torch.no_grad():
names = []
values = []
# sort the keys so that they are consistent across processes
for k in sorted(input_dict.keys()):
names.append(k)
values.append(input_dict[k])
values = torch.stack(values, dim=0)
dist.all_reduce(values)
if average:
values /= world_size
reduced_dict = {k: v for k, v in zip(names, values)}
return reduced_dict
class MetricLogger(object):
def __init__(self, delimiter="\t"):
self.meters = defaultdict(SmoothedValue)
self.delimiter = delimiter
def update(self, **kwargs):
for k, v in kwargs.items():
if isinstance(v, torch.Tensor):
v = v.item()
assert isinstance(v, (float, int))
self.meters[k].update(v)
def __getattr__(self, attr):
if attr in self.meters:
return self.meters[attr]
if attr in self.__dict__:
return self.__dict__[attr]
raise AttributeError("'{}' object has no attribute '{}'".format(
type(self).__name__, attr))
def __str__(self):
loss_str = []
for name, meter in self.meters.items():
loss_str.append(
"{}: {}".format(name, str(meter))
)
return self.delimiter.join(loss_str)
def synchronize_between_processes(self):
for meter in self.meters.values():
meter.synchronize_between_processes()
def add_meter(self, name, meter):
self.meters[name] = meter
def log_every(self, iterable, print_freq, header=None):
i = 0
if not header:
header = ''
start_time = time.time()
end = time.time()
iter_time = SmoothedValue(fmt='{avg:.4f}')
data_time = SmoothedValue(fmt='{avg:.4f}')
space_fmt = ':' + str(len(str(len(iterable)))) + 'd'
if torch.cuda.is_available():
log_msg = self.delimiter.join([
header,
'[{0' + space_fmt + '}/{1}]',
'eta: {eta}',
'{meters}',
'time: {time}',
'data: {data}',
'max mem: {memory:.0f}'
])
else:
log_msg = self.delimiter.join([
header,
'[{0' + space_fmt + '}/{1}]',
'eta: {eta}',
'{meters}',
'time: {time}',
'data: {data}'
])
MB = 1024.0 * 1024.0
for obj in iterable:
data_time.update(time.time() - end)
yield obj
iter_time.update(time.time() - end)
if i % print_freq == 0 or i == len(iterable) - 1:
eta_seconds = iter_time.global_avg * (len(iterable) - i)
eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
if torch.cuda.is_available():
print(log_msg.format(
i, len(iterable), eta=eta_string,
meters=str(self),
time=str(iter_time), data=str(data_time),
memory=torch.cuda.max_memory_allocated() / MB))
else:
print(log_msg.format(
i, len(iterable), eta=eta_string,
meters=str(self),
time=str(iter_time), data=str(data_time)))
i += 1
end = time.time()
total_time = time.time() - start_time
total_time_str = str(datetime.timedelta(seconds=int(total_time)))
print('{} Total time: {} ({:.4f} s / it)'.format(
header, total_time_str, total_time / len(iterable)))
def collate_fn(batch):
return tuple(zip(*batch))
def warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor):
def f(x):
if x >= warmup_iters:
return 1
alpha = float(x) / warmup_iters
return warmup_factor * (1 - alpha) + alpha
return torch.optim.lr_scheduler.LambdaLR(optimizer, f)
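# Worked example (added): with warmup_iters=1000 and warmup_factor=1/1000 the factor
# f(x) grows linearly from 0.001 at iteration 0 to 1.0 at iteration 1000; for example
# f(500) = 0.001 * (1 - 0.5) + 0.5 = 0.5005, and f(x) = 1 for every x >= 1000.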
def mkdir(path):
try:
os.makedirs(path)
except OSError as e:
if e.errno != errno.EEXIST:
raise
def setup_for_distributed(is_master):
"""
This function disables printing when not in master process
"""
import builtins as __builtin__
builtin_print = __builtin__.print
def print(*args, **kwargs):
force = kwargs.pop('force', False)
if is_master or force:
builtin_print(*args, **kwargs)
__builtin__.print = print
def is_dist_avail_and_initialized():
if not dist.is_available():
return False
if not dist.is_initialized():
return False
return True
def get_world_size():
if not is_dist_avail_and_initialized():
return 1
return dist.get_world_size()
def get_rank():
if not is_dist_avail_and_initialized():
return 0
return dist.get_rank()
def is_main_process():
return get_rank() == 0
def save_on_master(*args, **kwargs):
if is_main_process():
torch.save(*args, **kwargs)
def init_distributed_mode(args):
if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
args.rank = int(os.environ["RANK"])
args.world_size = int(os.environ['WORLD_SIZE'])
args.gpu = int(os.environ['LOCAL_RANK'])
elif 'SLURM_PROCID' in os.environ:
args.rank = int(os.environ['SLURM_PROCID'])
args.gpu = args.rank % torch.cuda.device_count()
else:
print('Not using distributed mode')
args.distributed = False
return
args.distributed = True
torch.cuda.set_device(args.gpu)
args.dist_backend = 'nccl'
print('| distributed init (rank {}): {}'.format(
args.rank, args.dist_url), flush=True)
torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
world_size=args.world_size, rank=args.rank)
torch.distributed.barrier()
setup_for_distributed(args.rank == 0)
import sys
import os
import time
import math
import numpy as np
import itertools
import struct # get_image_size
import imghdr # get_image_size
def sigmoid(x):
return 1.0 / (np.exp(-x) + 1.)
def softmax(x):
x = np.exp(x - np.expand_dims(np.max(x, axis=1), axis=1))
x = x / np.expand_dims(x.sum(axis=1), axis=1)
return x
def bbox_iou(box1, box2, x1y1x2y2=True):
# print('iou box1:', box1)
# print('iou box2:', box2)
if x1y1x2y2:
mx = min(box1[0], box2[0])
Mx = max(box1[2], box2[2])
my = min(box1[1], box2[1])
My = max(box1[3], box2[3])
w1 = box1[2] - box1[0]
h1 = box1[3] - box1[1]
w2 = box2[2] - box2[0]
h2 = box2[3] - box2[1]
else:
w1 = box1[2]
h1 = box1[3]
w2 = box2[2]
h2 = box2[3]
mx = min(box1[0], box2[0])
Mx = max(box1[0] + w1, box2[0] + w2)
my = min(box1[1], box2[1])
My = max(box1[1] + h1, box2[1] + h2)
uw = Mx - mx
uh = My - my
cw = w1 + w2 - uw
ch = h1 + h2 - uh
carea = 0
if cw <= 0 or ch <= 0:
return 0.0
area1 = w1 * h1
area2 = w2 * h2
carea = cw * ch
uarea = area1 + area2 - carea
return carea / uarea
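# Illustrative sketch (added; the boxes are made up): IoU of two overlapping squares in
# x1y1x2y2 format, computed with the function above.
def _demo_bbox_iou():
    box1 = [0.0, 0.0, 2.0, 2.0]
    box2 = [1.0, 1.0, 3.0, 3.0]
    # intersection = 1, union = 4 + 4 - 1 = 7, so the result is 1/7 ~= 0.143
    return bbox_iou(box1, box2, x1y1x2y2=True)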
def nms_cpu(boxes, confs, nms_thresh=0.5, min_mode=False):
# print(boxes.shape)
x1 = boxes[:, 0]
y1 = boxes[:, 1]
x2 = boxes[:, 2]
y2 = boxes[:, 3]
areas = (x2 - x1) * (y2 - y1)
order = confs.argsort()[::-1]
keep = []
while order.size > 0:
idx_self = order[0]
idx_other = order[1:]
keep.append(idx_self)
xx1 = np.maximum(x1[idx_self], x1[idx_other])
yy1 = np.maximum(y1[idx_self], y1[idx_other])
xx2 = np.minimum(x2[idx_self], x2[idx_other])
yy2 = np.minimum(y2[idx_self], y2[idx_other])
w = np.maximum(0.0, xx2 - xx1)
h = np.maximum(0.0, yy2 - yy1)
inter = w * h
if min_mode:
over = inter / np.minimum(areas[order[0]], areas[order[1:]])
else:
over = inter / (areas[order[0]] + areas[order[1:]] - inter)
inds = np.where(over <= nms_thresh)[0]
order = order[inds + 1]
return np.array(keep)
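# Illustrative sketch (added; boxes and scores are made up): two heavily overlapping
# boxes plus one isolated box; with nms_thresh=0.5 only the higher-scoring box of the
# overlapping pair and the isolated box are kept.
def _demo_nms_cpu():
    boxes = np.array([[0., 0., 10., 10.],
                      [1., 1., 11., 11.],
                      [50., 50., 60., 60.]])
    confs = np.array([0.9, 0.8, 0.7])
    return nms_cpu(boxes, confs, nms_thresh=0.5)   # expected: array([0, 2])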
def plot_boxes_cv2(img, boxes, savename=None, class_names=None, color=None):
import cv2
img = np.copy(img)
colors = np.array([[1, 0, 1], [0, 0, 1], [0, 1, 1], [0, 1, 0], [1, 1, 0], [1, 0, 0]], dtype=np.float32)
def get_color(c, x, max_val):
ratio = float(x) / max_val * 5
i = int(math.floor(ratio))
j = int(math.ceil(ratio))
ratio = ratio - i
r = (1 - ratio) * colors[i][c] + ratio * colors[j][c]
return int(r * 255)
width = img.shape[1]
height = img.shape[0]
for i in range(len(boxes)):
box = boxes[i]
x1 = int(box[0] * width)
y1 = int(box[1] * height)
x2 = int(box[2] * width)
y2 = int(box[3] * height)
if color:
rgb = color
else:
rgb = (255, 0, 0)
if len(box) >= 7 and class_names:
cls_conf = box[5]
cls_id = box[6]
print('%s: %f' % (class_names[cls_id], cls_conf))
classes = len(class_names)
offset = cls_id * 123457 % classes
red = get_color(2, offset, classes)
green = get_color(1, offset, classes)
blue = get_color(0, offset, classes)
if color is None:
rgb = (red, green, blue)
img = cv2.putText(img, class_names[cls_id], (x1, y1), cv2.FONT_HERSHEY_SIMPLEX, 1.2, rgb, 1)
img = cv2.rectangle(img, (x1, y1), (x2, y2), rgb, 1)
if savename:
print("save plot results to %s" % savename)
cv2.imwrite(savename, img)
return img
def read_truths(lab_path):
if not os.path.exists(lab_path):
return np.array([])
if os.path.getsize(lab_path):
truths = np.loadtxt(lab_path)
truths = truths.reshape(truths.size // 5, 5)  # to avoid single truth problem (integer division for Python 3)
return truths
else:
return np.array([])
def load_class_names(namesfile):
class_names = []
with open(namesfile, 'r') as fp:
lines = fp.readlines()
for line in lines:
line = line.rstrip()
class_names.append(line)
return class_names
def post_processing(img, conf_thresh, nms_thresh, output):
# anchors = [12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401]
# num_anchors = 9
# anchor_masks = [[0, 1, 2], [3, 4, 5], [6, 7, 8]]
# strides = [8, 16, 32]
# anchor_step = len(anchors) // num_anchors
# [batch, num, 1, 4]
box_array = output[0]
# [batch, num, num_classes]
confs = output[1]
t1 = time.time()
if type(box_array).__name__ != 'ndarray':
box_array = box_array.cpu().detach().numpy()
confs = confs.cpu().detach().numpy()
num_classes = confs.shape[2]
# [batch, num, 4]
box_array = box_array[:, :, 0]
# [batch, num, num_classes] --> [batch, num]
max_conf = np.max(confs, axis=2)
max_id = np.argmax(confs, axis=2)
t2 = time.time()
bboxes_batch = []
for i in range(box_array.shape[0]):
argwhere = max_conf[i] > conf_thresh
l_box_array = box_array[i, argwhere, :]
l_max_conf = max_conf[i, argwhere]
l_max_id = max_id[i, argwhere]
bboxes = []
# nms for each class
for j in range(num_classes):
cls_argwhere = l_max_id == j
ll_box_array = l_box_array[cls_argwhere, :]
ll_max_conf = l_max_conf[cls_argwhere]
ll_max_id = l_max_id[cls_argwhere]
keep = nms_cpu(ll_box_array, ll_max_conf, nms_thresh)
if (keep.size > 0):
ll_box_array = ll_box_array[keep, :]
ll_max_conf = ll_max_conf[keep]
ll_max_id = ll_max_id[keep]
for k in range(ll_box_array.shape[0]):
bboxes.append([ll_box_array[k, 0], ll_box_array[k, 1], ll_box_array[k, 2], ll_box_array[k, 3], ll_max_conf[k], ll_max_conf[k], ll_max_id[k]])
bboxes_batch.append(bboxes)
t3 = time.time()
print('-----------------------------------')
print(' max and argmax : %f' % (t2 - t1))
print(' nms : %f' % (t3 - t2))
print('Post processing total : %f' % (t3 - t1))
print('-----------------------------------')
return bboxes_batch
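# Minimal sketch (added; the tensors are random and the thresholds arbitrary): calling
# post_processing on dummy outputs with the layout described above, i.e. boxes of shape
# [batch, num, 1, 4] and class confidences of shape [batch, num, num_classes].
def _demo_post_processing():
    batch, num, num_classes = 1, 100, 3
    box_array = np.random.rand(batch, num, 1, 4).astype(np.float32)
    confs = np.random.rand(batch, num, num_classes).astype(np.float32)
    return post_processing(None, conf_thresh=0.4, nms_thresh=0.6,
                           output=[box_array, confs])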
# -*- coding: utf-8 -*-
'''
Bounding-box overlap measures (IoU, GIoU, DIoU, CIoU) implemented on PyTorch tensors
for boxes in 'voc', 'yolo' and 'coco' formats.
'''
import torch
import os, sys
from torch.nn import functional as F
import numpy as np
from packaging import version
__all__ = [
"bboxes_iou",
"bboxes_giou",
"bboxes_diou",
"bboxes_ciou",
]
if version.parse(torch.__version__) >= version.parse('1.5.0'):
def _true_divide(dividend, divisor):
return torch.true_divide(dividend, divisor)
else:
def _true_divide(dividend, divisor):
return dividend / divisor
def bboxes_iou(bboxes_a, bboxes_b, fmt='voc', iou_type='iou'):
"""Calculate the Intersection of Unions (IoUs) between bounding boxes.
IoU is calculated as a ratio of area of the intersection
and area of the union.
Args:
bbox_a (array): An array whose shape is :math:`(N, 4)`.
:math:`N` is the number of bounding boxes.
The dtype should be :obj:`numpy.float32`.
bbox_b (array): An array similar to :obj:`bbox_a`,
whose shape is :math:`(K, 4)`.
The dtype should be :obj:`numpy.float32`.
Returns:
array:
An array whose shape is :math:`(N, K)`. \
An element at index :math:`(n, k)` contains IoUs between \
:math:`n` th bounding box in :obj:`bbox_a` and :math:`k` th bounding \
box in :obj:`bbox_b`.
from: https://github.com/chainer/chainercv
"""
if bboxes_a.shape[1] != 4 or bboxes_b.shape[1] != 4:
raise IndexError
N, K = bboxes_a.shape[0], bboxes_b.shape[0]
if fmt.lower() == 'voc': # xmin, ymin, xmax, ymax
# top left
tl_intersect = torch.max(
bboxes_a[:, np.newaxis, :2],
bboxes_b[:, :2]
) # of shape `(N,K,2)`
# bottom right
br_intersect = torch.min(
bboxes_a[:, np.newaxis, 2:],
bboxes_b[:, 2:]
)
bb_a = bboxes_a[:, 2:] - bboxes_a[:, :2]
bb_b = bboxes_b[:, 2:] - bboxes_b[:, :2]
# bb_* can also be seen vectors representing box_width, box_height
elif fmt.lower() == 'yolo': # xcen, ycen, w, h
# top left
tl_intersect = torch.max(
bboxes_a[:, np.newaxis, :2] - bboxes_a[:, np.newaxis, 2:] / 2,
bboxes_b[:, :2] - bboxes_b[:, 2:] / 2
)
# bottom right
br_intersect = torch.min(
bboxes_a[:, np.newaxis, :2] + bboxes_a[:, np.newaxis, 2:] / 2,
bboxes_b[:, :2] + bboxes_b[:, 2:] / 2
)
bb_a = bboxes_a[:, 2:]
bb_b = bboxes_b[:, 2:]
elif fmt.lower() == 'coco': # xmin, ymin, w, h
# top left
tl_intersect = torch.max(
bboxes_a[:, np.newaxis, :2],
bboxes_b[:, :2]
)
# bottom right
br_intersect = torch.min(
bboxes_a[:, np.newaxis, :2] + bboxes_a[:, np.newaxis, 2:],
bboxes_b[:, :2] + bboxes_b[:, 2:]
)
bb_a = bboxes_a[:, 2:]
bb_b = bboxes_b[:, 2:]
area_a = torch.prod(bb_a, 1)
area_b = torch.prod(bb_b, 1)
# torch.prod(input, dim, keepdim=False, dtype=None) → Tensor
# Returns the product of each row of the input tensor in the given dimension dim
# if tl, br do not form a non-degenerate rectangle, the corresponding element of the `prod` is 0
en = (tl_intersect < br_intersect).type(tl_intersect.type()).prod(dim=2) # shape `(N,K,2)` ---> shape `(N,K)`
area_intersect = torch.prod(br_intersect - tl_intersect, 2) * en # * ((tl < br).all())
area_union = (area_a[:, np.newaxis] + area_b - area_intersect)
iou = _true_divide(area_intersect, area_union)
if iou_type.lower() == 'iou':
return iou
if fmt.lower() == 'voc': # xmin, ymin, xmax, ymax
# top left
tl_union = torch.min(
bboxes_a[:, np.newaxis, :2],
bboxes_b[:, :2]
) # of shape `(N,K,2)`
# bottom right
br_union = torch.max(
bboxes_a[:, np.newaxis, 2:],
bboxes_b[:, 2:]
)
elif fmt.lower() == 'yolo': # xcen, ycen, w, h
# top left
tl_union = torch.min(
bboxes_a[:, np.newaxis, :2] - bboxes_a[:, np.newaxis, 2:] / 2,
bboxes_b[:, :2] - bboxes_b[:, 2:] / 2
)
# bottom right
br_union = torch.max(
bboxes_a[:, np.newaxis, :2] + bboxes_a[:, np.newaxis, 2:] / 2,
bboxes_b[:, :2] + bboxes_b[:, 2:] / 2
)
elif fmt.lower() == 'coco': # xmin, ymin, w, h
# top left
tl_union = torch.min(
bboxes_a[:, np.newaxis, :2],
bboxes_b[:, :2]
)
# bottom right
br_union = torch.max(
bboxes_a[:, np.newaxis, :2] + bboxes_a[:, np.newaxis, 2:],
bboxes_b[:, :2] + bboxes_b[:, 2:]
)
# c for covering, of shape `(N,K,2)`
# the last dim is box width, box height
bboxes_c = br_union - tl_union
area_covering = torch.prod(bboxes_c, 2) # shape `(N,K)`
giou = iou - _true_divide(area_covering - area_union, area_covering)
if iou_type.lower() == 'giou':
return giou
if fmt.lower() == 'voc': # xmin, ymin, xmax, ymax
centre_a = (bboxes_a[..., 2 :] + bboxes_a[..., : 2]) / 2
centre_b = (bboxes_b[..., 2 :] + bboxes_b[..., : 2]) / 2
elif fmt.lower() == 'yolo': # xcen, ycen, w, h
centre_a = bboxes_a[..., : 2]
centre_b = bboxes_b[..., : 2]
elif fmt.lower() == 'coco': # xmin, ymin, w, h
centre_a = bboxes_a[..., 2 :] + bboxes_a[..., : 2]/2
centre_b = bboxes_b[..., 2 :] + bboxes_b[..., : 2]/2
centre_dist = torch.norm(centre_a[:, np.newaxis] - centre_b, p='fro', dim=2)
diag_len = torch.norm(bboxes_c, p='fro', dim=2)
diou = iou - _true_divide(centre_dist.pow(2), diag_len.pow(2))
if iou_type.lower() == 'diou':
return diou
""" the legacy custom cosine similarity:
# bb_a of shape `(N,2)`, bb_b of shape `(K,2)`
v = torch.einsum('nm,km->nk', bb_a, bb_b)
v = _true_divide(v, (torch.norm(bb_a, p='fro', dim=1)[:,np.newaxis] * torch.norm(bb_b, p='fro', dim=1)))
# avoid nan for torch.acos near \pm 1
# https://github.com/pytorch/pytorch/issues/8069
eps = 1e-7
v = torch.clamp(v, -1+eps, 1-eps)
"""
v = F.cosine_similarity(bb_a[:,np.newaxis,:], bb_b, dim=-1)
v = (_true_divide(2*torch.acos(v), np.pi)).pow(2)
with torch.no_grad():
alpha = (_true_divide(v, 1-iou+v)) * ((iou>=0.5).type(iou.type()))
ciou = diou - alpha * v
if iou_type.lower() == 'ciou':
return ciou
def bboxes_giou(bboxes_a, bboxes_b, fmt='voc'):
return bboxes_iou(bboxes_a, bboxes_b, fmt, 'giou')
def bboxes_diou(bboxes_a, bboxes_b, fmt='voc'):
return bboxes_iou(bboxes_a, bboxes_b, fmt, 'diou')
def bboxes_ciou(bboxes_a, bboxes_b, fmt='voc'):
return bboxes_iou(bboxes_a, bboxes_b, fmt, 'ciou')
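# Illustrative sketch (added; the boxes are made up): the helpers above return N x K
# matrices; here N = K = 1 with boxes in 'voc' (xmin, ymin, xmax, ymax) format.
def _demo_bboxes_iou():
    a = torch.tensor([[0., 0., 2., 2.]])
    b = torch.tensor([[1., 1., 4., 3.]])
    return {
        'iou': bboxes_iou(a, b, fmt='voc', iou_type='iou'),   # 1/9 ~= 0.111
        'giou': bboxes_giou(a, b, fmt='voc'),
        'diou': bboxes_diou(a, b, fmt='voc'),
        'ciou': bboxes_ciou(a, b, fmt='voc'),
    }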
# -*- coding: utf-8 -*-
'''
Test utilities, with printing and optional plotting, for the bounding-box IoU functions.
'''
import torch
import os, sys
from torch.nn import functional as F
from easydict import EasyDict as ED
import numpy as np
from packaging import version
if version.parse(torch.__version__) >= version.parse('1.5.0'):
def _true_divide(dividend, divisor):
return torch.true_divide(dividend, divisor)
else:
def _true_divide(dividend, divisor):
return dividend / divisor
def bboxes_iou_test(bboxes_a, bboxes_b, fmt='voc', iou_type='iou'):
"""
test function for the bboxes_iou function in `train_acne.py`,
with message printing and plot
"""
if 'plt' not in dir():
import matplotlib.pyplot as plt
if 'cv2' not in dir():
try:
import cv2
except ModuleNotFoundError:
cv2 = None
from PIL import Image, ImageDraw
assert iou_type.lower() in ['iou', 'giou', 'diou', 'ciou']
if isinstance(bboxes_a, np.ndarray):
bboxes_a = torch.Tensor(bboxes_a)
if isinstance(bboxes_b, np.ndarray):
bboxes_b = torch.Tensor(bboxes_b)
if bboxes_a.shape[1] != 4 or bboxes_b.shape[1] != 4:
raise IndexError
N, K = bboxes_a.shape[0], bboxes_b.shape[0]
# if N, K all equal 1, then plot
# top left
if fmt.lower() == 'voc': # xmin, ymin, xmax, ymax
# top left
tl_intersect = torch.max(bboxes_a[:, np.newaxis, :2], bboxes_b[:, :2]) # of shape `(N,K,2)`
# bottom right
br_intersect = torch.min(bboxes_a[:, np.newaxis, 2:], bboxes_b[:, 2:])
bb_a = bboxes_a[:, 2:] - bboxes_a[:, :2] # w, h
bb_b = bboxes_b[:, 2:] - bboxes_b[:, :2] # w, h
elif fmt.lower() == 'yolo': # xcen, ycen, w, h
tl_intersect = torch.max((bboxes_a[:, np.newaxis, :2] - bboxes_a[:, np.newaxis, 2:] / 2),
(bboxes_b[:, :2] - bboxes_b[:, 2:] / 2))
# bottom right
br_intersect = torch.min((bboxes_a[:, np.newaxis, :2] + bboxes_a[:, np.newaxis, 2:] / 2),
(bboxes_b[:, :2] + bboxes_b[:, 2:] / 2))
bb_a = bboxes_a[:, 2:]
bb_b = bboxes_b[:, 2:]
elif fmt.lower() == 'coco': # xmin, ymin, w, h
# top left
tl_intersect = torch.max(bboxes_a[:, np.newaxis, :2], bboxes_b[:, :2])
# bottom right
br_intersect = torch.min((bboxes_a[:, np.newaxis, :2] + bboxes_a[:, np.newaxis, 2:]),
(bboxes_b[:, :2] + bboxes_b[:, 2:]))
bb_a = bboxes_a[:, 2:]
bb_b = bboxes_b[:, 2:]
area_a = torch.prod(bb_a, 1)
area_b = torch.prod(bb_b, 1)
# torch.prod(input, dim, keepdim=False, dtype=None) → Tensor
# Returns the product of each row of the input tensor in the given dimension dim
# if tl, br do not form a non-degenerate rectangle, the corresponding element of the `prod` is 0
en = (tl_intersect < br_intersect).type(tl_intersect.type()).prod(dim=2) # shape `(N,K,2)` ---> shape `(N,K)`
area_intersect = torch.prod(br_intersect - tl_intersect, 2) * en # * ((tl < br).all())
area_union = (area_a[:, np.newaxis] + area_b - area_intersect)
iou = _true_divide(area_intersect, area_union)
# if iou_type.lower() == 'iou':
# return iou
if fmt.lower() == 'voc': # xmin, ymin, xmax, ymax
# top left
tl_union = torch.min(bboxes_a[:, np.newaxis, :2], bboxes_b[:, :2]) # of shape `(N,K,2)`
# bottom right
br_union = torch.max(bboxes_a[:, np.newaxis, 2:], bboxes_b[:, 2:])
elif fmt.lower() == 'yolo': # xcen, ycen, w, h
tl_union = torch.min((bboxes_a[:, np.newaxis, :2] - bboxes_a[:, np.newaxis, 2:] / 2),
(bboxes_b[:, :2] - bboxes_b[:, 2:] / 2))
# bottom right
br_union = torch.max((bboxes_a[:, np.newaxis, :2] + bboxes_a[:, np.newaxis, 2:] / 2),
(bboxes_b[:, :2] + bboxes_b[:, 2:] / 2))
elif fmt.lower() == 'coco': # xmin, ymin, w, h
# top left
tl_union = torch.min(bboxes_a[:, np.newaxis, :2], bboxes_b[:, :2])
# bottom right
br_union = torch.max((bboxes_a[:, np.newaxis, :2] + bboxes_a[:, np.newaxis, 2:]),
(bboxes_b[:, :2] + bboxes_b[:, 2:]))
# c for covering, of shape `(N,K,2)`
# the last dim is box width, box height
bboxes_c = br_union - tl_union
area_covering = torch.prod(bboxes_c, 2) # shape `(N,K)`
giou = iou - (area_covering - area_union) / area_covering
print(f"tl_union.shape = {tl_union.shape}")
print(f"br_union.shape = {br_union.shape}")
print(f"bboxes_c.shape = {bboxes_c.shape}")
# if iou_type.lower() == 'giou':
# return giou
if fmt.lower() == 'voc': # xmin, ymin, xmax, ymax
centre_a = (bboxes_a[..., 2 :] + bboxes_a[..., : 2]) / 2
centre_b = (bboxes_b[..., 2 :] + bboxes_b[..., : 2]) / 2
elif fmt.lower() == 'yolo': # xcen, ycen, w, h
centre_a = (bboxes_a[..., : 2] + bboxes_a[..., 2 :]) / 2
centre_b = (bboxes_b[..., : 2] + bboxes_b[..., 2 :]) / 2
elif fmt.lower() == 'coco': # xmin, ymin, w, h
centre_a = bboxes_a[..., 2 :] + bboxes_a[..., : 2]/2
centre_b = bboxes_b[..., 2 :] + bboxes_b[..., : 2]/2
centre_dist = torch.norm(centre_a[:, np.newaxis] - centre_b, p='fro', dim=2)
diag_len = torch.norm(bboxes_c, p='fro', dim=2)
diou = iou - centre_dist.pow(2) / diag_len.pow(2)
# if iou_type.lower() == 'diou':
# return diou
""" the legacy custom cosine similarity:
# bb_a of shape `(N,2)`, bb_b of shape `(K,2)`
v = torch.einsum('nm,km->nk', bb_a, bb_b)
v = _true_divide(v, (torch.norm(bb_a, p='fro', dim=1)[:,np.newaxis] * torch.norm(bb_b, p='fro', dim=1)))
# avoid nan for torch.acos near \pm 1
# https://github.com/pytorch/pytorch/issues/8069
eps = 1e-7
v = torch.clamp(v, -1+eps, 1-eps)
"""
v = F.cosine_similarity(bb_a[:,np.newaxis,:], bb_b, dim=-1)
v = (_true_divide(2*torch.acos(v), np.pi)).pow(2)
alpha = (_true_divide(v, 1-iou+v))*((iou>=0.5).type(iou.type()))
ciou = diou - alpha * v
if N==K==1:
print("\n"+"*"*50)
print(f"bboxes_a = {bboxes_a}")
print(f"bboxes_b = {bboxes_b}")
print(f"area_a = {area_a}")
print(f"area_b = {area_b}")
print(f"area_intersect = {area_intersect}")
print(f"area_union = {area_union}")
print(f"tl_intersect = {tl_intersect}")
print(f"br_intersect = {br_intersect}")
print(f"tl_union = {tl_union}")
print(f"br_union = {br_union}")
print(f"area_covering (area of bboxes_c) = {area_covering}")
print(f"centre_dist = {centre_dist}")
print(f"diag_len = {diag_len}")
print("for computing ciou")
inner_product = torch.einsum('nm,km->nk', bb_a, bb_b)
product_of_lengths = torch.norm(bb_a, p='fro', dim=1)[:,np.newaxis] * torch.norm(bb_b, p='fro', dim=1)
print(f"inner product of bb_a and bb_b is {inner_product}")
print(f"product of lengths of bb_a and bb_b is {product_of_lengths}")
print(f"inner product divided by product of lengths equals {_true_divide(inner_product, product_of_lengths)}")
print(f"normalized angle distance = {v}")
print(f"alpha = {alpha}")
print(f"v = {v}")
print(f"alpha = {alpha}")
bc = ED({"xmin":tl_union.numpy().astype(int)[0][0][0], "ymin":tl_union.numpy().astype(int)[0][0][1], "xmax":br_union.numpy().astype(int)[0][0][0], "ymax":br_union.numpy().astype(int)[0][0][1]})
adjust_x = bc.xmin - int(0.25*(bc.xmax-bc.xmin))
adjust_y = bc.ymin - int(0.25*(bc.ymax-bc.ymin))
print(f"adjust_x = {adjust_x}")
print(f"adjust_y = {adjust_y}")
bc.xmin, bc.ymin, bc.xmax, bc.ymax = bc.xmin-adjust_x, bc.ymin-adjust_y, bc.xmax-adjust_x, bc.ymax-adjust_y
ba, bb = bboxes_a.numpy().astype(int)[0], bboxes_b.numpy().astype(int)[0]
if fmt.lower() == 'voc': # xmin, ymin, xmax, ymax
ba = ED({"xmin":ba[0]-adjust_x, "ymin":ba[1]-adjust_y, "xmax":ba[2]-adjust_x, "ymax":ba[3]-adjust_y})
bb = ED({"xmin":bb[0]-adjust_x, "ymin":bb[1]-adjust_y, "xmax":bb[2]-adjust_x, "ymax":bb[3]-adjust_y})
elif fmt.lower() == 'yolo': # xcen, ycen, w, h
ba = ED({"xmin":ba[0]-ba[2]//2-adjust_x, "ymin":ba[1]-ba[3]//2-adjust_y, "xmax":ba[0]+ba[2]//2-adjust_x, "ymax":ba[1]+ba[3]//2-adjust_y})
bb = ED({"xmin":bb[0]-bb[2]//2-adjust_x, "ymin":bb[1]-bb[3]//2-adjust_y, "xmax":bb[0]+bb[2]//2-adjust_x, "ymax":bb[1]+bb[3]//2-adjust_y})
elif fmt.lower() == 'coco': # xmin, ymin, w, h
ba = ED({"xmin":ba[0]-adjust_x, "ymin":ba[1]-adjust_y, "xmax":ba[0]+ba[2]-adjust_x, "ymax":ba[1]+ba[3]-adjust_y})
bb = ED({"xmin":bb[0]-adjust_x, "ymin":bb[1]-adjust_y, "xmax":bb[0]+bb[2]-adjust_x, "ymax":bb[1]+bb[3]-adjust_y})
print(f"ba = {ba}")
print(f"bb = {bb}")
print(f"bc = {bc}")
plane = np.full(shape=(int(1.5*(bc.ymax-bc.ymin)),int(1.5*(bc.xmax-bc.xmin)),3), fill_value=255, dtype=np.uint8)
img_with_boxes = plane.copy()
line_size = 1
if cv2:
cv2.rectangle(img_with_boxes, (ba.xmin, ba.ymin), (ba.xmax, ba.ymax), (0, 255, 0), line_size)
cv2.rectangle(img_with_boxes, (bb.xmin, bb.ymin), (bb.xmax, bb.ymax), (0, 0, 255), line_size)
cv2.rectangle(img_with_boxes, (max(0,bc.xmin-1), max(0,bc.ymin-1)), (bc.xmax, bc.ymax), (255, 0, 0), line_size)
else:
img_with_boxes = Image.fromarray(img_with_boxes)
drawer = ImageDraw.Draw(img_with_boxes)
# drawer.line([(ba.xmin, ba.ymin), (ba.xmin, ba.ymax), (ba.xmax, ba.ymax), (ba.xmax, ba.ymin), (ba.xmin, ba.ymin)], fill='green', width=line_size)
# drawer.line([(bb.xmin, bb.ymin), (bb.xmin, bb.ymax), (bb.xmax, bb.ymax), (bb.xmax, bb.ymin), (bb.xmin, bb.ymin)], fill='blue', width=line_size)
# drawer.line([(max(0,bc.xmin-1), max(0,bc.ymin-1)), (max(0,bc.xmin-1), bc.ymax), (bc.xmax, bc.ymax), (bc.xmax, max(0,bc.ymin-1)), (max(0,bc.xmin-1), max(0,bc.ymin-1))], fill='red', width=line_size)
drawer.rectangle([(ba.xmin, ba.ymin), (ba.xmax, ba.ymax)], outline='green', width=line_size)
drawer.rectangle([(bb.xmin, bb.ymin), (bb.xmax, bb.ymax)], outline='blue', width=line_size)
drawer.rectangle([(max(0,bc.xmin-1), max(0,bc.ymin-1)), (bc.xmax+1, bc.ymax+1)], outline='red', width=line_size)
img_with_boxes = np.array(img_with_boxes)
del drawer
plt.figure(figsize=(7,7))
plt.imshow(img_with_boxes)
plt.show()
print(f"iou = {iou}")
print(f"giou = {giou}")
print(f"diou = {diou}")
print(f"ciou = {ciou}")
if iou_type.lower() == 'ciou':
return ciou
elif iou_type.lower() == 'diou':
return diou
elif iou_type.lower() == 'giou':
return giou
elif iou_type.lower() == 'iou':
return iou
def original_iou_test(bboxes_a, bboxes_b, xyxy=True):
"""
test function for the original iou function in `train.py`
"""
if bboxes_a.shape[1] != 4 or bboxes_b.shape[1] != 4:
raise IndexError
if isinstance(bboxes_a, np.ndarray):
bboxes_a = torch.Tensor(bboxes_a)
if isinstance(bboxes_b, np.ndarray):
bboxes_b = torch.Tensor(bboxes_b)
N, K = bboxes_a.shape[0], bboxes_b.shape[0]
# if N, K all equal 1, then plot
# top left
if xyxy:
tl = torch.max(bboxes_a[:, None, :2], bboxes_b[:, :2])
# bottom right
br = torch.min(bboxes_a[:, None, 2:], bboxes_b[:, 2:])
area_a = torch.prod(bboxes_a[:, 2:] - bboxes_a[:, :2], 1)
area_b = torch.prod(bboxes_b[:, 2:] - bboxes_b[:, :2], 1)
else:
tl = torch.max((bboxes_a[:, None, :2] - bboxes_a[:, None, 2:] / 2),
(bboxes_b[:, :2] - bboxes_b[:, 2:] / 2))
# bottom right
br = torch.min((bboxes_a[:, None, :2] + bboxes_a[:, None, 2:] / 2),
(bboxes_b[:, :2] + bboxes_b[:, 2:] / 2))
area_a = torch.prod(bboxes_a[:, 2:], 1)
area_b = torch.prod(bboxes_b[:, 2:], 1)
en = (tl < br).type(tl.type()).prod(dim=2)
area_i = torch.prod(br - tl, 2) * en # * ((tl < br).all())
print(f"tl.shape = {tl.shape}")
print(f"br.shape = {br.shape}")
print(f"area_a.shape = {area_a.shape}")
print(f"area_b.shape = {area_b.shape}")
print(f"en.shape = {en.shape}")
print(f"area_i.shape = {area_i.shape}")
if N == K == 1:
pass
return area_i / (area_a[:, None] + area_b - area_i)
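if __name__ == "__main__":
    # Hedged usage sketch for `original_iou_test` (the box values below are made up for
    # illustration): two partially overlapping boxes in xyxy format give a pairwise IoU
    # matrix of shape (1, 1).
    _demo_a = torch.Tensor([[10., 10., 50., 50.]])  # xmin, ymin, xmax, ymax
    _demo_b = torch.Tensor([[30., 30., 70., 70.]])
    print(original_iou_test(_demo_a, _demo_b, xyxy=True))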
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from tool.torch_utils import *
def yolo_forward(output, conf_thresh, num_classes, anchors, num_anchors, scale_x_y, only_objectness=1,
validation=False):
# Output would be invalid if it does not satisfy this assert
# assert (output.size(1) == (5 + num_classes) * num_anchors)
# print(output.size())
# Slice the second dimension (channel) of output into:
# [ 2, 2, 1, num_classes, 2, 2, 1, num_classes, 2, 2, 1, num_classes ]
# And then into
# bxy = [ 6 ] bwh = [ 6 ] det_conf = [ 3 ] cls_conf = [ num_classes * 3 ]
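# e.g. with num_classes = 80 and num_anchors = 3 the channel dimension is 3 * (5 + 80) = 255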
batch = output.size(0)
H = output.size(2)
W = output.size(3)
bxy_list = []
bwh_list = []
det_confs_list = []
cls_confs_list = []
for i in range(num_anchors):
begin = i * (5 + num_classes)
end = (i + 1) * (5 + num_classes)
bxy_list.append(output[:, begin : begin + 2])
bwh_list.append(output[:, begin + 2 : begin + 4])
det_confs_list.append(output[:, begin + 4 : begin + 5])
cls_confs_list.append(output[:, begin + 5 : end])
# Shape: [batch, num_anchors * 2, H, W]
bxy = torch.cat(bxy_list, dim=1)
# Shape: [batch, num_anchors * 2, H, W]
bwh = torch.cat(bwh_list, dim=1)
# Shape: [batch, num_anchors, H, W]
det_confs = torch.cat(det_confs_list, dim=1)
# Shape: [batch, num_anchors * H * W]
det_confs = det_confs.view(batch, num_anchors * H * W)
# Shape: [batch, num_anchors * num_classes, H, W]
cls_confs = torch.cat(cls_confs_list, dim=1)
# Shape: [batch, num_anchors, num_classes, H * W]
cls_confs = cls_confs.view(batch, num_anchors, num_classes, H * W)
# Shape: [batch, num_anchors, num_classes, H * W] --> [batch, num_anchors * H * W, num_classes]
cls_confs = cls_confs.permute(0, 1, 3, 2).reshape(batch, num_anchors * H * W, num_classes)
# Apply sigmoid() to the xy offsets and the confidences, and exp() to the wh scales
#
bxy = torch.sigmoid(bxy) * scale_x_y - 0.5 * (scale_x_y - 1)
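# Here, scale_x_y > 1 stretches the sigmoid output around 0.5 so predicted centre offsets
# can actually reach the borders of a grid cell (the "grid sensitivity" adjustment used by YOLOv4).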
bwh = torch.exp(bwh)
det_confs = torch.sigmoid(det_confs)
cls_confs = torch.sigmoid(cls_confs)
# Prepare C-x, C-y, P-w, P-h (None of them are torch related)
grid_x = np.expand_dims(np.expand_dims(np.expand_dims(np.linspace(0, W - 1, W), axis=0).repeat(H, 0), axis=0), axis=0)
grid_y = np.expand_dims(np.expand_dims(np.expand_dims(np.linspace(0, H - 1, H), axis=1).repeat(W, 1), axis=0), axis=0)
# grid_x = torch.linspace(0, W - 1, W).reshape(1, 1, 1, W).repeat(1, 1, H, 1)
# grid_y = torch.linspace(0, H - 1, H).reshape(1, 1, H, 1).repeat(1, 1, 1, W)
anchor_w = []
anchor_h = []
for i in range(num_anchors):
anchor_w.append(anchors[i * 2])
anchor_h.append(anchors[i * 2 + 1])
device = None
cuda_check = output.is_cuda
if cuda_check:
device = output.get_device()
bx_list = []
by_list = []
bw_list = []
bh_list = []
# Apply C-x, C-y, P-w, P-h
for i in range(num_anchors):
ii = i * 2
# Shape: [batch, 1, H, W]
bx = bxy[:, ii : ii + 1] + torch.tensor(grid_x, device=device, dtype=torch.float32) # grid_x.to(device=device, dtype=torch.float32)
# Shape: [batch, 1, H, W]
by = bxy[:, ii + 1 : ii + 2] + torch.tensor(grid_y, device=device, dtype=torch.float32) # grid_y.to(device=device, dtype=torch.float32)
# Shape: [batch, 1, H, W]
bw = bwh[:, ii : ii + 1] * anchor_w[i]
# Shape: [batch, 1, H, W]
bh = bwh[:, ii + 1 : ii + 2] * anchor_h[i]
bx_list.append(bx)
by_list.append(by)
bw_list.append(bw)
bh_list.append(bh)
########################################
# Figure out bboxes from slices #
########################################
# Shape: [batch, num_anchors, H, W]
bx = torch.cat(bx_list, dim=1)
# Shape: [batch, num_anchors, H, W]
by = torch.cat(by_list, dim=1)
# Shape: [batch, num_anchors, H, W]
bw = torch.cat(bw_list, dim=1)
# Shape: [batch, num_anchors, H, W]
bh = torch.cat(bh_list, dim=1)
# Shape: [batch, 2 * num_anchors, H, W]
bx_bw = torch.cat((bx, bw), dim=1)
# Shape: [batch, 2 * num_anchors, H, W]
by_bh = torch.cat((by, bh), dim=1)
# normalize coordinates to [0, 1]
bx_bw /= W
by_bh /= H
# Shape: [batch, num_anchors * H * W, 1]
bx = bx_bw[:, :num_anchors].view(batch, num_anchors * H * W, 1)
by = by_bh[:, :num_anchors].view(batch, num_anchors * H * W, 1)
bw = bx_bw[:, num_anchors:].view(batch, num_anchors * H * W, 1)
bh = by_bh[:, num_anchors:].view(batch, num_anchors * H * W, 1)
bx1 = bx - bw * 0.5
by1 = by - bh * 0.5
bx2 = bx1 + bw
by2 = by1 + bh
# Shape: [batch, num_anchors * h * w, 4] -> [batch, num_anchors * h * w, 1, 4]
boxes = torch.cat((bx1, by1, bx2, by2), dim=2).view(batch, num_anchors * H * W, 1, 4)
# boxes = boxes.repeat(1, 1, num_classes, 1)
# boxes: [batch, num_anchors * H * W, 1, 4]
# cls_confs: [batch, num_anchors * H * W, num_classes]
# det_confs: [batch, num_anchors * H * W]
det_confs = det_confs.view(batch, num_anchors * H * W, 1)
confs = cls_confs * det_confs
# boxes: [batch, num_anchors * H * W, 1, 4]
# confs: [batch, num_anchors * H * W, num_classes]
return boxes, confs
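# Hedged shape summary for `yolo_forward` (sizes are illustrative): given a head output of
# shape [batch, num_anchors * (5 + num_classes), H, W], it returns normalized boxes of shape
# [batch, num_anchors * H * W, 1, 4] and class scores of shape
# [batch, num_anchors * H * W, num_classes]. `yolo_forward_dynamic` below performs the same
# decoding but reads batch, H and W from output.size() at runtime, so no fixed spatial size
# is baked into the graph (useful, e.g., when exporting with dynamic input shapes).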
def yolo_forward_dynamic(output, conf_thresh, num_classes, anchors, num_anchors, scale_x_y, only_objectness=1,
validation=False):
# Output would be invalid if it does not satisfy this assert
# assert (output.size(1) == (5 + num_classes) * num_anchors)
# print(output.size())
# Slice the second dimension (channel) of output into:
# [ 2, 2, 1, num_classes, 2, 2, 1, num_classes, 2, 2, 1, num_classes ]
# And then into
# bxy = [ 6 ] bwh = [ 6 ] det_conf = [ 3 ] cls_conf = [ num_classes * 3 ]
# batch = output.size(0)
# H = output.size(2)
# W = output.size(3)
bxy_list = []
bwh_list = []
det_confs_list = []
cls_confs_list = []
for i in range(num_anchors):
begin = i * (5 + num_classes)
end = (i + 1) * (5 + num_classes)
bxy_list.append(output[:, begin : begin + 2])
bwh_list.append(output[:, begin + 2 : begin + 4])
det_confs_list.append(output[:, begin + 4 : begin + 5])
cls_confs_list.append(output[:, begin + 5 : end])
# Shape: [batch, num_anchors * 2, H, W]
bxy = torch.cat(bxy_list, dim=1)
# Shape: [batch, num_anchors * 2, H, W]
bwh = torch.cat(bwh_list, dim=1)
# Shape: [batch, num_anchors, H, W]
det_confs = torch.cat(det_confs_list, dim=1)
# Shape: [batch, num_anchors * H * W]
det_confs = det_confs.view(output.size(0), num_anchors * output.size(2) * output.size(3))
# Shape: [batch, num_anchors * num_classes, H, W]
cls_confs = torch.cat(cls_confs_list, dim=1)
# Shape: [batch, num_anchors, num_classes, H * W]
cls_confs = cls_confs.view(output.size(0), num_anchors, num_classes, output.size(2) * output.size(3))
# Shape: [batch, num_anchors, num_classes, H * W] --> [batch, num_anchors * H * W, num_classes]
cls_confs = cls_confs.permute(0, 1, 3, 2).reshape(output.size(0), num_anchors * output.size(2) * output.size(3), num_classes)
# Apply sigmoid() to the xy offsets and the confidences, and exp() to the wh scales
#
bxy = torch.sigmoid(bxy) * scale_x_y - 0.5 * (scale_x_y - 1)
bwh = torch.exp(bwh)
det_confs = torch.sigmoid(det_confs)
cls_confs = torch.sigmoid(cls_confs)
# Prepare C-x, C-y, P-w, P-h (None of them are torch related)
grid_x = np.expand_dims(np.expand_dims(np.expand_dims(np.linspace(0, output.size(3) - 1, output.size(3)), axis=0).repeat(output.size(2), 0), axis=0), axis=0)
grid_y = np.expand_dims(np.expand_dims(np.expand_dims(np.linspace(0, output.size(2) - 1, output.size(2)), axis=1).repeat(output.size(3), 1), axis=0), axis=0)
# grid_x = torch.linspace(0, W - 1, W).reshape(1, 1, 1, W).repeat(1, 1, H, 1)
# grid_y = torch.linspace(0, H - 1, H).reshape(1, 1, H, 1).repeat(1, 1, 1, W)
anchor_w = []
anchor_h = []
for i in range(num_anchors):
anchor_w.append(anchors[i * 2])
anchor_h.append(anchors[i * 2 + 1])
device = None
cuda_check = output.is_cuda
if cuda_check:
device = output.get_device()
bx_list = []
by_list = []
bw_list = []
bh_list = []
# Apply C-x, C-y, P-w, P-h
for i in range(num_anchors):
ii = i * 2
# Shape: [batch, 1, H, W]
bx = bxy[:, ii : ii + 1] + torch.tensor(grid_x, device=device, dtype=torch.float32) # grid_x.to(device=device, dtype=torch.float32)
# Shape: [batch, 1, H, W]
by = bxy[:, ii + 1 : ii + 2] + torch.tensor(grid_y, device=device, dtype=torch.float32) # grid_y.to(device=device, dtype=torch.float32)
# Shape: [batch, 1, H, W]
bw = bwh[:, ii : ii + 1] * anchor_w[i]
# Shape: [batch, 1, H, W]
bh = bwh[:, ii + 1 : ii + 2] * anchor_h[i]
bx_list.append(bx)
by_list.append(by)
bw_list.append(bw)
bh_list.append(bh)
########################################
# Figure out bboxes from slices #
########################################
# Shape: [batch, num_anchors, H, W]
bx = torch.cat(bx_list, dim=1)
# Shape: [batch, num_anchors, H, W]
by = torch.cat(by_list, dim=1)
# Shape: [batch, num_anchors, H, W]
bw = torch.cat(bw_list, dim=1)
# Shape: [batch, num_anchors, H, W]
bh = torch.cat(bh_list, dim=1)
# Shape: [batch, 2 * num_anchors, H, W]
bx_bw = torch.cat((bx, bw), dim=1)
# Shape: [batch, 2 * num_anchors, H, W]
by_bh = torch.cat((by, bh), dim=1)
# normalize coordinates to [0, 1]
bx_bw /= output.size(3)
by_bh /= output.size(2)
# Shape: [batch, num_anchors * H * W, 1]
bx = bx_bw[:, :num_anchors].view(output.size(0), num_anchors * output.size(2) * output.size(3), 1)
by = by_bh[:, :num_anchors].view(output.size(0), num_anchors * output.size(2) * output.size(3), 1)
bw = bx_bw[:, num_anchors:].view(output.size(0), num_anchors * output.size(2) * output.size(3), 1)
bh = by_bh[:, num_anchors:].view(output.size(0), num_anchors * output.size(2) * output.size(3), 1)
bx1 = bx - bw * 0.5
by1 = by - bh * 0.5
bx2 = bx1 + bw
by2 = by1 + bh
# Shape: [batch, num_anchors * h * w, 4] -> [batch, num_anchors * h * w, 1, 4]
boxes = torch.cat((bx1, by1, bx2, by2), dim=2).view(output.size(0), num_anchors * output.size(2) * output.size(3), 1, 4)
# boxes = boxes.repeat(1, 1, num_classes, 1)
# boxes: [batch, num_anchors * H * W, 1, 4]
# cls_confs: [batch, num_anchors * H * W, num_classes]
# det_confs: [batch, num_anchors * H * W]
det_confs = det_confs.view(output.size(0), num_anchors * output.size(2) * output.size(3), 1)
confs = cls_confs * det_confs
# boxes: [batch, num_anchors * H * W, 1, 4]
# confs: [batch, num_anchors * H * W, num_classes]
return boxes, confs
class YoloLayer(nn.Module):
    ''' Yolo layer
    model_out: during inference, whether post-processing is done inside or
    outside the model (True: outside)
    '''
def __init__(self, anchor_mask=[], num_classes=0, anchors=[], num_anchors=1, stride=32, model_out=False):
super(YoloLayer, self).__init__()
self.anchor_mask = anchor_mask
self.num_classes = num_classes
self.anchors = anchors
self.num_anchors = num_anchors
self.anchor_step = len(anchors) // num_anchors
self.coord_scale = 1
self.noobject_scale = 1
self.object_scale = 5
self.class_scale = 1
self.thresh = 0.6
self.stride = stride
self.seen = 0
self.scale_x_y = 1
self.model_out = model_out
def forward(self, output, target=None):
if self.training:
return output
masked_anchors = []
for m in self.anchor_mask:
masked_anchors += self.anchors[m * self.anchor_step:(m + 1) * self.anchor_step]
masked_anchors = [anchor / self.stride for anchor in masked_anchors]
return yolo_forward_dynamic(output, self.thresh, self.num_classes, masked_anchors, len(self.anchor_mask),scale_x_y=self.scale_x_y)
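if __name__ == "__main__":
    # Hedged smoke test (anchor values and sizes below are illustrative assumptions, not
    # taken from any config in this repo): decode one 76x76 head of an 80-class model.
    _num_classes = 80
    _anchors = [12, 16, 19, 36, 40, 28, 36, 75, 76, 55,
                72, 146, 142, 110, 192, 243, 459, 401]
    _layer = YoloLayer(anchor_mask=[0, 1, 2], num_classes=_num_classes,
                       anchors=_anchors, num_anchors=9, stride=8)
    _layer.eval()  # in training mode the layer returns its input unchanged
    _dummy = torch.randn(1, 3 * (5 + _num_classes), 76, 76)
    _boxes, _confs = _layer(_dummy)
    print(_boxes.shape, _confs.shape)  # (1, 3*76*76, 1, 4) and (1, 3*76*76, 80)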