PyTorch框架的Yolov5移植–寒武纪开发者社区|离线模型和在线模型_在线学习

本文对开源yolov5s模型进行寒武纪平台的移植

整个移植过程分为模型结构转换、添加后处理算子框架代码、模型量化、在线推理和离线推理共五个步骤。

对于原始Yolov5网络的后处理逻辑，Cambricon-PyTorch直接使用一个大的BANGC算子完成计算。因此需要对原生的PyTorch网络进行修改，将后处理部分的整体计算换成BANGC算子。

具体做法是将yololayer层替换成yolov5_detection_output，并把三个yololayer的输入传给yolov5_detection_output。修改部分在yolo.py中，如下：

ifx[0].device.type=='mlu':

foriinrange(self.nl):

x[i]=self.m[i](x[i])#conv

y=x[i].sigmoid()

output.append(y)

detect_out=torch.ops.torch_mlu.yolov5_detection_output(output[0],output[1],output[2],

self.anchors_list,self.nc,self.num_anchors,

self.img_h,self.img_w,self.conf_thres,self.iou_thres,self.maxBoxNum)

returndetect_out

ifx[0].device.type=='cpu':

z=[]

bs,_,ny,nx=x[i].shape#x(bs,255,20,20)tox(bs,3,20,20,85)

x[i]=x[i].view(bs,self.na,self.no,ny,nx).permute(0,1,3,4,2).contiguous()

ifnotself.training:#inference

ifself.grid[i].shape[2:4]!=x[i].shape[2:4]:

self.grid[i]=self._make_grid(nx,ny).to(x[i].device)

y[...,0:2]=(y[...,0:2]*2.-0.5+self.grid[i].to(x[i].device))*self.stride[i]#xy

y[...,2:4]=(y[...,2:4]*2)**2*self.anchor_grid[i]#wh

z.append(y.view(bs,-1,self.no))

returnxifself.trainingelsetorch.cat(z,1)

在第一步中我们使用了BANGC编写的Yolov5Detection算子替换掉了原始的后处理逻辑。为了保证能够正确调用到这个算子，需要将该Yolov5Detection算子集成到框架中。

共分成两步：先将算子集成到CNPlugin中，然后将CNPlugin算子集成到Cambricon-PyTorch。该算子的实现与集成到CNPlugin的过程会在另一个教程中详细介绍，这里只介绍将该算子集成到Cambricon-PyTorch这一步骤。

-name:yolov5_detection_output//算子名称

use_mlu_dispatcher:custom//分发类型，unboxed_only为标准化算子，custom为客制化算子

derived_type:cnml//派生类型

schema_string:torch_mlu::yolov5_detection_output//用于算子注册

arguments://参数

-name:alpha_data//参数名称

type:constat::Tensor&//参数类型

-name:beta_data

type:constat::Tensor&

-name:gamma_data

-name:anchor_data

type:torch::List

-name:num_classes

type:int64_t

-name:num_anchors

-name:img_height

-name:img_width

-name:conf_thres

type:double

-name:nms_thres

-name:maxBoxNum

type:int64_treturn_type:at::Tensor//函数返回类型

2）添加OpMethods基类中的CPU实现。

//op_methods.h

virtualat::Tensoryolov5_detection_output(constat::Tensor&alpha_data,constat::Tensor&beta_data,constat::Tensor&gamma_data,torch::Listanchor_data,int64_tnum_classes,int64_tnum_anchors,int64_timg_height,int64_timg_width,doubleconf_thres,doublenms_thres,int64_tmaxBoxNum);

//op_methods.cpp因为在这里并没有添加CPU实现，直接抛出异常

at::TensorOpMethods::yolov5_detection_output(

constat::Tensor&alpha_data,constat::Tensor&beta_data,

constat::Tensor&gamma_data,torch::Listanchor_data,

int64_tnum_classes,int64_tnum_anchors,int64_timg_height,int64_timg_width,

doubleconf_thres,doublenms_thres,int64_tmaxBoxNum){

//Todo:cpukernel

throwstd::invalid_argument("TodoforCPU");

3）添加wrapper

推理算子或训练算子会优先分发到wrapper中。wrapper是对算子kernel的封装，每个算子对应一个wrapper。根据模板生成的wrapper头文件cnml_kernel.h，添加wrapper的实现。

//cnml_kernel.h

at::Tensorcnml_yolov5_detection_output(constat::Tensor&alpha_data,constat::Tensor&beta_data,constat::Tensor&gamma_data,torch::Listanchor_data,int64_tnum_classes,int64_tnum_anchors,int64_timg_height,int64_timg_width,doubleconf_thres,doublenms_thres,int64_tmaxBoxNum);

//yolov5_detection_output.cpp

at::Tensorcnml_yolov5_detection_output(constat::Tensor&alpha_data,

constat::Tensor&beta_data,

constat::Tensor&gamma_data,

torch::Listanchor_data,

int64_tnum_classes,

int64_tnum_anchors,

int64_timg_height,

int64_timg_width,

doubleconf_thres,

doublenms_thres,

int64_tmaxBoxNum){

autoalpha_new=alpha_data;

autobeta_new=beta_data;

autogamma_new=gamma_data;

boolcast_fp32_fp16=(toCnmlDataType(alpha_data.dtype())==CNML_DATA_FLOAT32);

if(cast_fp32_fp16){

alpha_new=cnml_cast_internal(alpha_new,CNML_CAST_FLOAT32_TO_FLOAT16);

beta_new=cnml_cast_internal(beta_new,CNML_CAST_FLOAT32_TO_FLOAT16);

gamma_new=cnml_cast_internal(gamma_new,CNML_CAST_FLOAT32_TO_FLOAT16);

}

returncnml_yolov5_detection_output_internal(alpha_new,

beta_new,

gamma_new,

anchor_data,

num_classes,

num_anchors,

img_height,

img_width,

conf_thres,

nms_thres,

maxBoxNum);

4）添加kernel

Wrapper中通过调用kernel实现算子功能。算子的具体实现主要通过调用CNML库来完成。以下是CNML库的简要逻辑。

//cnml_internal.h

at::Tensorcnml_yolov5_detection_output_internal(constat::Tensor&alpha_data,

int64_tmaxBoxNum);

//yolov5_detection_output_internal.cpp

intbatch_size=alpha_data.size(0);

intinputNum=3;

intoutput_num=2;

intmaskGroupNum=3;

intclassNum=num_classes;

intmaxbox_num=maxBoxNum;

intnet_w=img_width;

intnet_h=img_height;

floatconfidence_thresh=static_cast(conf_thres);

floatnms_thresh=static_cast(nms_thres);

cnmlTensor_tcnml_input_ptr[3];

cnmlTensor_tcnml_output_ptr[2];

//prepareinputcnmltensor

auto*alpha_impl=getMluTensorImpl(alpha_data);

autoalpha_cnml=alpha_impl->CreateCnmlTensor(CNML_TENSOR,

toCnmlDataType(alpha_data.dtype()));

auto*beta_impl=getMluTensorImpl(beta_data);

autobeta_cnml=beta_impl->CreateCnmlTensor(CNML_TENSOR,

toCnmlDataType(beta_data.dtype()));

auto*gamma_impl=getMluTensorImpl(gamma_data);

autogamma_cnml=gamma_impl->CreateCnmlTensor(CNML_TENSOR,

toCnmlDataType(gamma_data.dtype()));

autooutput=at::empty({batch_size,maxbox_num*7+64,1,1},

alpha_data.options());

auto*output_impl=getMluTensorImpl(output);

autooutput_cnml=output_impl->CreateCnmlTensor(CNML_TENSOR,

toCnmlDataType(output.dtype()));

//prepareinputcnmltensorformulticore

intbuf_size=1024*(alpha_data.size(2)*alpha_data.size(3)+

beta_data.size(2)*beta_data.size(3)+

gamma_data.size(2)*gamma_data.size(3));

autotemp_buf=at::empty({batch_size,buf_size,1,1},alpha_data.options());

auto*temp_buf_impl=getMluTensorImpl(temp_buf);

autotemp_buf_cnml=temp_buf_impl->CreateCnmlTensor(CNML_TENSOR,

toCnmlDataType(temp_buf.dtype()));

//EndtheexecutionflowifnotMLUdevice

CHECK_MLU_DEVICE(output);

cnml_input_ptr[0]=alpha_cnml;

cnml_input_ptr[1]=beta_cnml;

cnml_input_ptr[2]=gamma_cnml;

cnml_output_ptr[0]=output_cnml;

cnml_output_ptr[1]=temp_buf_cnml;

//prepareh_arr

std::vectorh_arr_data(64,1);

h_arr_data[0]=alpha_data.size(2);

h_arr_data[1]=beta_data.size(2);

h_arr_data[2]=gamma_data.size(2);

inth_data[]={h_arr_data[0],h_arr_data[1],h_arr_data[2]};

//preparew_arr

std::vectorw_arr_data(64,1);

w_arr_data[0]=alpha_data.size(3);

w_arr_data[1]=beta_data.size(3);

w_arr_data[2]=gamma_data.size(3);

intw_data[]={w_arr_data[0],w_arr_data[1],w_arr_data[2]};

//preparebias_arr

std::vectorbias_arr_data(64,1.0);

floatbias_data[64];

for(inti=0;i

bias_arr_data[i]=(float)anchor_data[i];

bias_data[i]=bias_arr_data[i];

cnmlPluginYolov5DetectionOutputOpParam_tYolov5params;

TORCH_CNML_CHECK(cnmlCreatePluginYolov5DetectionOutputOpParam(&Yolov5params,

batch_size,

inputNum,

classNum,

maskGroupNum,

maxbox_num,

net_w,

net_h,

confidence_thresh,

nms_thresh,

GET_CORE_VERSION,

w_data,

h_data,

bias_data));

cnmlBaseOp_tyolov5_op;

TORCH_CNML_CHECK(cnmlCreatePluginYolov5DetectionOutputOp(&yolov5_op,

Yolov5params,

cnml_input_ptr,

cnml_output_ptr));

//returntoJITifrunningmodeisfuse

CHECK_RETURN_TO_FUSE(yolov5_op,output);

//getqueueandfunc_param

cnrtInvokeFuncParam_tfunc_param;

staticu32_taffinity=0x01;

intdata_parallelism=1;

func_param.affinity=&affinity;

func_param.data_parallelism=&data_parallelism;

func_param.end=CNRT_PARAM_END;

autoqueue=getCurQueue();

//compileallops

TORCH_CNML_CHECK(cnmlCompileBaseOp(yolov5_op,

GET_CORE_NUMBER));

void*input_addrs[3];

void*output_addrs[2];

input_addrs[0]=alpha_impl->raw_mutable_data();

input_addrs[1]=beta_impl->raw_mutable_data();

input_addrs[2]=gamma_impl->raw_mutable_data();

output_addrs[0]=output_impl->raw_mutable_data();

output_addrs[1]=temp_buf_impl->raw_mutable_data();

//computeoperator

TORCH_CNML_CHECK(cnmlComputePluginYolov5DetectionOutputOpForward(yolov5_op,

input_addrs,

output_addrs,

&func_param,

queue));

syncQueue(queue);

TORCH_CNML_CHECK(cnmlDestroyPluginYolov5DetectionOutputOpParam(&Yolov5params));

TORCH_CNML_CHECK(cnmlDestroyBaseOp(&yolov5_op));

returnoutput;

5)重新编译Cambricon-Pytorch

上述步骤操作完成后，重新编译Cambricon-PyTorch，进入Python环境确认是否集成成功，如下：

Python3.5.2(default,Nov122018,13:43:14)

[GCC5.4.020160609]onlinux

Type"help","copyright","credits"or"license"formoreinformation.

>>>importtorch

>>>importtorch_mlu

CNML:7.7.0a414883

CNRT:4.6.0e158c88

>>>torch.ops.torch_mlu.yolov5_detection_output

为什么要量化：量化是将float32的模型转换为int8/int16的模型，可以在保证计算精度处于目标误差范围内的情况下，显著减少模型占用的存储空间和带宽，加速推理。比如int8模型是指将数值以有符号8位整型数据保存，并提供int8定点数的指数position和缩放因子scale，因此int8模型中每个8位整数i表示的实际值为：$value = (i \times 2^{position}) / scale$。设备在进行在线推理和生成离线模型时仅支持输入量化后的模型。

参数含义：

其中qconfig_spec包括：{'iteration':1,'use_avg':False,'data_scale':1.0,'mean':[0,0,0],'std':[1,1,1],'firstconv':True,'per_channel':False}

参考代码：

parser=argparse.ArgumentParser()

parser.add_argument('--cfg',type=str,default='yolov5s.yaml',help='model.yaml')

parser.add_argument('--device',default='cpu',help='cudadevice,i.e.0or0,1,2,3orcpu')

opt=parser.parse_args()

#获取yolov5网络文件

net=yolo.get_model(opt)

#在这里设置firstconv参数为False,因为该模型首层为focus算子，非卷积，无法开启first_conv

qconfig={'iteration':1,'use_avg':False,'data_scale':1.0,'firstconv':False,'per_channel':False}

#调用量化接口

quantized_net=mlu_quantize.quantize_dynamic_mlu(net.float(),qconfig_spec=qconfig,dtype='int8',gen_quant=True)

#设置为推理模式

quantized_net=quantized_net.eval().float()

#读取图片做预处理

img_mat=Image.open("./images/image.jpg")

ifimg_mat.mode!='RGB':

img_mat=img_mat.convert('RGB')

crop=640

resize=640

transform=transforms.Compose([

transforms.Resize(resize),

transforms.CenterCrop(crop),

transforms.ToTensor(),

])

img=transform(img_mat)

im_tensor=torch.unsqueeze(img,0)

im_tensor=im_tensor.float()

#推理生成量化值

quantized_net(im_tensor)

#保存量化后的模型

torch.save(quantized_net.state_dict(),'./yolov5s_int8.pt')

操作步骤：对步骤2生成的量化后的yolov5s_int8.pt进行在线推理测试。对图片进行推理，画出目标框和标注置信度。在yolov5_pytorch_demo/quantize_online目录中，示例如下：

1）逐层模式：python detect.py

推理后的图片存储在./results目录下

推理过程：

融合模式：被融合的多个层作为单独的运算（单个Kernel）在MLU上运行。根据网络中的层是否可以被融合，网络被拆分为若干个子网络段。MLU与CPU间的数据拷贝只在各个子网络之间发生。

逐层模式：逐层模式中，每层的操作都作为单独的运算（单个Kernel）在MLU上运行，可以将每层结果导出到CPU上，方便进行调试。

一般来说，在线逐层模式更适用于调试环节，在线融合模式可以查看网络融合情况。

主要步骤：

1）设置：torch.set_grad_enabled(False)#注意：在运行MLU推理融合模式时，这个条件是必须要设置的。

2）获取模型加载权重：

4）如果要运行在线融合模式，需要在运行前向过程前调用jit.trace()接口生成静态图。首先会对整个网络运行一遍逐层模式，同时构建一个静态图；然后对静态图进行优化（包括去除冗余算子、小算子融合、数据块复用等）得到一个优化后的静态图；之后会根据输入数据的设备类型进行基于设备的优化，生成针对当前设备的指令：

5）最后根据推理结果为图片加框和标记置信度

parser.add_argument('--jit',type=bool,help='fusion',default=False)

parser.add_argument('--save',type=bool,default=False,help='selectionofsave*.cambrcion')

#获取yolov5网络并加载量化后的权重

net=yolo.get_empty_model(opt)

quantized_net=torch_mlu.core.mlu_quantize.quantize_dynamic_mlu(net)

state_dict=torch.load('yolov5s_int8.pt')

quantized_net.load_state_dict(state_dict,strict=False)

#转移到MLU上进行推理

device=ct.mlu_device()

quantized_net.to(ct.mlu_device())

#读取图片

img_mat=cv2.imread('images/image.jpg')

#调用预处理函数做预处理

img=letter_box(img_mat)

#设置融合模式，save选项表示是否生成离线模型，因为在进行在线融合推理时，可以生成离线模型

ifopt.jit:

ifopt.save:

ct.save_as_cambricon('yolov5s')

torch.set_grad_enabled(False)

ct.set_core_number(4)

trace_input=torch.randn(1,3,640,640,dtype=torch.float)

trace_input=trace_input.to(ct.mlu_device())

quantized_net=torch.jit.trace(quantized_net,trace_input,check_trace=False)

#推理

detect_out=quantized_net(img.to(ct.mlu_device()))

#关闭生成离线模型

ct.save_as_cambricon("")

detect_out=detect_out.to(torch.device('cpu'))

#为原图添加框、检测类别和置信度

box_result=get_boxes(detect_out)

draw_boxes(box_result)

注意：原始的yolov5网络可以对输入的图片做自适应的预处理，预处理后的图片可以有不同的大小。而在当前demo中，由于在线融合推理和离线推理都要求输入大小固定，预处理统一改为输出固定大小640*640的图片。

1）生成离线模型

操作步骤：

生成离线模型与在线推理代码相似，在yolov5_pytorch_demo/quantize_online目录中，示例如下：

python detect.py --jit True --save True

会在当前目录生成离线模型yolov5s.cambricon和离线模型信息文件yolov5s.cambricon_twins

2）离线推理

对一张图片进行离线推理，画出目标框和置信度。

示例图片放置在yolov5_pytorch_demo/offline/yolov5_offline_simple_demo/data目录下，离线模型放置在model目录下。执行make.sh在src目录下生成可执行文件，执行run.sh对一张图片进行推理，在result目录下生成推理后的图片。

THE END

PyTorch框架的Yolov5移植–寒武纪开发者社区

必知！5大AI生成模型算法序列神经网络ai生成模型

基于对照经验公式的构件本构模型离线更新混合试验方法

风储联合电站实时自调度的高效深度确定性策略梯度算法

PyTorch框架的Yolov5移植–寒武纪开发者社区

在线学习和离线学习淼淼兮予怀

西北工业大学离线具身智能研究取得新突破