0%

docker build

https://docs.docker.com/engine/reference/builder/

install docker

1
2
# Install the yum-utils package (non-interactive, -y answers yes to prompts).
sudo yum install -y yum-utils
# Add the Docker CE repository definition hosted on the Aliyun mirror.
sudo yum-config-manager --add-repo http://mirrors.aliyun.com/docker-ce/linux/centos/docker-ce.repo

ubuntu docker

https://askubuntu.com/questions/1140183/install-gcc-9-on-ubuntu-18-04

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
# Build image: Ubuntu 16.04 with gcc-9/g++-9, Python 3.7, pip and pre-commit.
FROM ubuntu:16.04

LABEL com.zhangjun.image.authors="ewalker.zj@gmail.com"

# key=value form; the space-separated "ENV key value" form is legacy syntax.
ENV TZ="Asia/Shanghai"

# Everything runs in a single layer so the apt caches removed at the end do
# not remain in an earlier layer.
# - wget and ca-certificates are installed explicitly: the base image does not
#   ship wget, and the download below is over HTTPS.
# - The version-specific get-pip.py is used because the generic script only
#   supports newer Python releases.
RUN apt update && \
    apt -qqy install software-properties-common wget ca-certificates && \
    add-apt-repository -y ppa:ubuntu-toolchain-r/test && \
    add-apt-repository -y ppa:deadsnakes/ppa && \
    apt update && \
    apt -qqy install gcc-9 g++-9 && \
    apt -qqy install python3.7 && \
    update-alternatives --install /usr/bin/python python /usr/bin/python3.7 10 && \
    update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 40 --slave /usr/bin/g++ g++ /usr/bin/g++-9 && \
    wget https://bootstrap.pypa.io/pip/3.7/get-pip.py && \
    python3.7 get-pip.py && \
    python3.7 -m pip install pre-commit && \
    apt-get -qqy clean && \
    rm -rf get-pip.py && rm -rf /var/lib/apt/lists/*

# To switch the active toolchain / interpreter interactively:
# update-alternatives --config gcc
# update-alternatives --config python

macOS python load Framework

https://docs.python.org/3/library/ctypes.html

1
2
3
4
5
6
7
8
9
10
11
12
from ctypes.util import find_library
from ctypes import cdll
import os


def load_framework(name):
    """Locate and load a shared library / framework by name.

    Returns the loaded ``CDLL`` handle, or ``None`` when ``find_library``
    cannot locate ``name``.  The explicit ``None`` check matters: on POSIX,
    ``cdll.LoadLibrary(None)`` does not fail but opens the running program
    itself, which would look like a successful load.
    """
    path = find_library(name)
    if path is None:
        return None
    return cdll.LoadLibrary(path)


print(os.name)
for framework in ("Metal", "CoreGraphics", "MetalPerformanceShaders"):
    print(load_framework(framework))

core

context

1
2
3
4
5
6
7
8
9
10
11
12
13
// Type-erased holder for a single device-specific context object.
class KernelContext {
 public:
  // Returns the stored context viewed as ContextT. If nothing has been stored
  // yet, a ContextT is created in place first.
  template <typename ContextT>
  ContextT& As() {
    if (ctx_.valid()) {
      return *ctx_.get_mutable<ContextT>();
    }
    ctx_.set<ContextT>();
    return *ctx_.get_mutable<ContextT>();
  }

 private:
  Any ctx_;  // holds the concrete context
};
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
class ContextScheduler {
public:
static ContextScheduler& Global() {
static auto* x = new ContextScheduler;
return *x;
}
std::unique_ptr<KernelContext> NewContext(
TargetType target,
/*only used for cuda context*/ int exec_stream_id = 0) {
std::unique_ptr<KernelContext> ctx(new KernelContext);
switch (target) {
case TARGET(kHost):
kernel_contexts_[TargetType::kHost].As<HostContext>().CopySharedTo(
&ctx->As<HostContext>());
break;
}
return ctx;
}
private:
template <TargetType Type, typename ContextT>
void InitContext() {
kernel_contexts_[Type].As<ContextT>().InitOnce();
}
ContextScheduler() {
InitContext<TargetType::kHost, HostContext>();
}
private:
std::map<TargetType, KernelContext> kernel_contexts_;
};

op lite

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
// Abstract base of the lite operators (AttachKernel is pure virtual). An op
// holds its type name, the places it may run on, the scope it is attached to
// and the kernel selected for it.
class OpLite : public Registry {
 public:
  OpLite() = default;
  explicit OpLite(const std::string &type) : op_type_(type) {}
  explicit OpLite(const std::vector<Place> &valid_places)
      : valid_places_(valid_places) {}

  // Replaces the list of places this op is allowed to run on.
  void SetValidPlaces(const std::vector<Place> &places) {
    valid_places_ = places;
    // Log after the assignment so the reported count describes the places
    // that are now in effect rather than the ones just overwritten.
    VLOG(5) << "valid places " << valid_places_.size();
  }
  virtual bool Run();
  // Indicate whether the Op runs only once or not
  virtual bool run_once() const { return false; }
  std::string Type() const { return op_type_; }

  // Link the external execution environ to internal context.
  bool Attach(const cpp::OpDesc &opdesc, lite::Scope *scope);

  // Stores `param` as this op's parameter block. T* must be convertible to
  // operators::ParamBase*; the conversion is implicit.
  template <typename T>
  inline void AttachParam(T *param) {
    op_param_ = param;
  }
  // Create all the kernels for the valid targets.
  std::vector<std::unique_ptr<KernelBase>> CreateKernels(
      const std::vector<Place> &places, const std::string &kernel_type = "");

  Scope *scope() { return scope_; }

  // Assign op param to kernel.
  virtual void AttachKernel(KernelBase *kernel) = 0;

  // Takes ownership of the FIRST kernel in `kernels` and gives it a new
  // context for its target. Precondition: `kernels` is non-empty (front() on
  // an empty vector is undefined behaviour). The first element of the
  // caller's vector is left in a moved-from (null) state.
  void SetKernel(std::vector<std::unique_ptr<KernelBase>> &kernels) {  // NOLINT
    kernel_ = std::move(kernels.front());
    kernel_->SetContext(
        ContextScheduler::Global().NewContext(kernel_->target()));
  }

  // Non-owning access to the selected kernel; null until SetKernel is called.
  KernelBase *GetKernel() {  // NOLINT
    return kernel_.get();
  }
  virtual ~OpLite() = default;

 protected:
  friend class mir::Node;
  friend class mir::SSAGraph;

 protected:
  Scope *scope_{nullptr};
  std::unique_ptr<KernelBase> kernel_;
  std::string op_type_;
  std::vector<Place> valid_places_;
  Place kernel_place_{TARGET(kHost), PRECISION(kFloat)};
  std::unique_ptr<OpInfo> op_info_;
  // todo: it's preferred to combine last_input_shapes and
  // last_input_lods into a single hash value to decrease
  // memory usage.
  std::vector<DDimLite> last_input_shapes{};
  std::vector<std::vector<std::vector<uint64_t>>> last_input_lods{};
  std::vector<DDimLite> last_output_shapes{};
  std::vector<std::vector<std::vector<uint64_t>>> last_output_lods{};
  mutable operators::ParamBase *op_param_{nullptr};

 private:
  // Infer Shape according to memory, if current input shapes are consistent
  // with that of previous inputs, output shapes of last time will be reused.
  bool InferShapeWithCache();
};
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
std::vector<std::unique_ptr<KernelBase>> OpLite::CreateKernels(
const std::vector<Place> &places, const std::string &kernel_type) {
std::vector<std::unique_ptr<KernelBase>> kernels;
CHECK(!op_type_.empty()) << "op_type_ should be set first";

auto pick_kernel = [&](const Place &place) {
auto ks = KernelRegistry::Global().Create(
op_type_, place.target, place.precision, place.layout);
VLOG(5) << "pick kernel for " << op_info()->Type() << " "
<< place.DebugString() << " get " << ks.size() << " kernels";
for (auto &&it : ks) {
AttachKernel(it.get());
kernels.emplace_back(std::move(it));
}
};

if (!kernel_type.empty()) {
Place place;
std::string op_type, alias;
KernelBase::ParseKernelType(kernel_type, &op_type, &alias, &place);
pick_kernel(place);
CHECK(!kernels.empty()) << "no kernel for kernel type " << kernel_type;
return kernels;
}

std::set<Place> expanded_places(places.begin(), places.end());
for (auto &place : places) {
// Pick kernels those support any Precision and any DataLayout, For example:
// kARM,kFloat,kNCHW -> kARM,kFloat,kAny; kARM,kAny,kNCHW; kARM,kAny,kAny
expanded_places.insert(
Place(place.target, place.precision, DATALAYOUT(kAny)));
expanded_places.insert(Place(place.target, PRECISION(kAny), place.layout));
expanded_places.insert(
Place(place.target, PRECISION(kAny), DATALAYOUT(kAny)));
}

std::set<TargetType> targets;
for (auto place : expanded_places) {
pick_kernel(place);
targets.insert(place.target);
}

VLOG(5) << "op " << op_type_ << " get " << kernels.size() << " kernels";
return kernels;
}
1
2
3
4
5
6
7
8
9
/*
* Operator Information, such as some description. It will be shared by all the
* kernels of the same operator.
*/
class OpInfo : public cpp::OpDesc {
public:
OpInfo(const OpInfo &) = default;
explicit OpInfo(const cpp::OpDesc &other) : cpp::OpDesc(other) {}
};

op registry

1
2
3
4
5
6
7
8
9
10
11
12
13
14
// Singleton holding two string-to-string maps: op name -> path and
// kernel name -> path.
class OpKernelInfoCollector {
 public:
  // Lazily constructed singleton; the instance is intentionally never deleted.
  static OpKernelInfoCollector& Global() {
    static OpKernelInfoCollector* instance = new OpKernelInfoCollector;
    return *instance;
  }

  // Implementations are not part of this excerpt.
  void AddOp2path(const std::string& op_name, const std::string& op_path);
  void AddKernel2path(const std::string& kernel_name,
                      const std::string& kernel_path);

 private:
  std::map<std::string, std::string> op2path_;
  std::map<std::string, std::string> kernel2path_;
};
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
class OpLiteFactory {
public:
// Register a function to create an op
void RegisterCreator(const std::string& op_type,
std::function<std::shared_ptr<OpLite>()> fun) {
op_registry_[op_type] = fun;
}

static OpLiteFactory& Global() {
static OpLiteFactory* x = new OpLiteFactory;
return *x;
}

std::shared_ptr<OpLite> Create(const std::string& op_type) const {
auto it = op_registry_.find(op_type);
if (it == op_registry_.end()) return nullptr;
return it->second();
}

std::string DebugString();

std::vector<std::string> GetAllOps() const {
std::vector<std::string> res;
for (const auto& op : op_registry_) {
res.push_back(op.first);
}
return res;
}

protected:
std::map<std::string, std::function<std::shared_ptr<OpLite>()>> op_registry_;
};
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
class OpLiteRegistrar {
public:
OpLiteRegistrar(const std::string& op_type,
std::function<std::shared_ptr<OpLite>()> fun) {
OpLiteFactory::Global().RegisterCreator(op_type, fun);
}
// Touch function is used to guarantee registrar was initialized.
void touch() {}
};

class KernelFactory {
public:
// Register a function to create kernels
void RegisterCreator(const std::string& op_type,
TargetType target,
PrecisionType precision,
DataLayoutType layout,
std::function<std::unique_ptr<KernelBase>()> fun) {
op_registry_[op_type][std::make_tuple(target, precision, layout)].push_back(
fun);
}

static KernelFactory& Global() {
static KernelFactory* x = new KernelFactory;
return *x;
}

/**
* Create all kernels belongs to an op.
*/
std::list<std::unique_ptr<KernelBase>> Create(const std::string& op_type) {
std::list<std::unique_ptr<KernelBase>> res;
if (op_registry_.find(op_type) == op_registry_.end()) return res;
auto& kernel_registry = op_registry_[op_type];
for (auto it = kernel_registry.begin(); it != kernel_registry.end(); ++it) {
for (auto& fun : it->second) {
res.emplace_back(fun());
}
}
return res;
}

/**
* Create a specific kernel. Return a list for API compatible.
*/
std::list<std::unique_ptr<KernelBase>> Create(const std::string& op_type,
TargetType target,
PrecisionType precision,
DataLayoutType layout) {
std::list<std::unique_ptr<KernelBase>> res;
if (op_registry_.find(op_type) == op_registry_.end()) return res;
auto& kernel_registry = op_registry_[op_type];
auto it = kernel_registry.find(std::make_tuple(target, precision, layout));
if (it == kernel_registry.end()) return res;
for (auto& fun : it->second) {
res.emplace_back(fun());
}
return res;
}

protected:
// Outer map: op -> a map of kernel.
// Inner map: kernel -> creator function.
// Each kernel was represented by a combination of <TargetType, PrecisionType,
// DataLayoutType>
std::map<std::string,
std::map<std::tuple<TargetType, PrecisionType, DataLayoutType>,
std::list<std::function<std::unique_ptr<KernelBase>()>>>>
op_registry_;
};

// Register Kernel by initializing a static KernelRegistrar instance
class KernelRegistrar {
public:
KernelRegistrar(const std::string& op_type,
TargetType target,
PrecisionType precision,
DataLayoutType layout,
std::function<std::unique_ptr<KernelBase>()> fun) {
KernelFactory::Global().RegisterCreator(
op_type, target, precision, layout, fun);
}
// Touch function is used to guarantee registrar was initialized.
void touch() {}
};

// No-op stand-in for a parameter-type registry: every builder call ignores
// its arguments and returns the builder itself, and Finalize() always reports
// success. The registry class itself cannot be instantiated from outside
// (private constructor); only the nested NewInstance builder is used.
class ParamTypeDummyRegistry {
 public:
  struct NewInstance {
    NewInstance() {}

    // All Bind*/Set* calls below discard their arguments.
    NewInstance& BindInput(const std::string& /*arg_name*/,
                           const ParamType& /*ptype*/) {
      return *this;
    }
    NewInstance& BindOutput(const std::string& /*arg_name*/,
                            const ParamType& /*ptype*/) {
      return *this;
    }
    NewInstance& SetVersion(const std::string& /*version*/) { return *this; }
    NewInstance& BindPaddleOpVersion(const std::string& /*op_type*/,
                                     int32_t /*version_id*/) {
      return *this;
    }

    bool Finalize() { return true; }
  };

 private:
  ParamTypeDummyRegistry() = default;
};

注册机制

op注册

1
2
3
4
5
6
7
8
9
10
// Registers an operator class under the name `op_type__`.
//
// Expands to:
//   1. a static paddle::lite::OpLiteRegistrar named `<op_type__>__registry`
//      whose creator lambda builds `new OpClass("<op_type__>")` (returned as a
//      unique_ptr to paddle::lite::OpLite);
//   2. a function `int touch_op_<op_type__>()` that calls touch() on that
//      registrar, records the op name together with __FILE__ in
//      OpKernelInfoCollector, and returns 0.
//
// The expansion ends with the closing brace of the touch function, so no
// trailing semicolon is required at the use site.
// NOTE: comments cannot be placed inside the macro body because every line is
// joined by a trailing backslash.
#define REGISTER_LITE_OP(op_type__, OpClass)                                   \
static paddle::lite::OpLiteRegistrar op_type__##__registry( \
#op_type__, []() { \
return std::unique_ptr<paddle::lite::OpLite>(new OpClass(#op_type__)); \
}); \
int touch_op_##op_type__() { \
op_type__##__registry.touch(); \
OpKernelInfoCollector::Global().AddOp2path(#op_type__, __FILE__); \
return 0; \
}

kernel 注册

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
// Register a kernel.
//
// Parameters: the op type, the target / precision / layout tokens (passed to
// TARGET(), PRECISION() and DATALAYOUT()), the kernel class and an alias.
//
// Expands to:
//   1. a static paddle::lite::KernelRegistrar whose name is the concatenation
//      of all five tokens plus `_kernel_registry`; its creator lambda builds a
//      KernelClass and sets its op type and alias from the stringified tokens;
//   2. a function `int touch_<tokens>()` that calls touch() on the registrar,
//      records "<op>,<target>,<precision>,<layout>,<alias>" together with
//      __FILE__ in OpKernelInfoCollector, and returns 0;
//   3. a trailing `ParamTypeRegistry(...)` expression with no terminating
//      semicolon — presumably so the use site can continue it with chained
//      calls; confirm against the ParamTypeRegistry definition.
// NOTE: comments cannot be placed inside the macro body because every line is
// joined by a trailing backslash.
#define REGISTER_LITE_KERNEL( \
op_type__, target__, precision__, layout__, KernelClass, alias__) \
static paddle::lite::KernelRegistrar \
op_type__##target__##precision__##layout__##alias__##_kernel_registry( \
#op_type__, \
TARGET(target__), \
PRECISION(precision__), \
DATALAYOUT(layout__), \
[]() { \
std::unique_ptr<KernelClass> x(new KernelClass); \
x->set_op_type(#op_type__); \
x->set_alias(#alias__); \
return x; \
}); \
int touch_##op_type__##target__##precision__##layout__##alias__() { \
op_type__##target__##precision__##layout__##alias__##_kernel_registry \
.touch(); \
OpKernelInfoCollector::Global().AddKernel2path( \
#op_type__ "," #target__ "," #precision__ "," #layout__ "," #alias__, \
__FILE__); \
return 0; \
} \
ParamTypeRegistry( \
op_type__, target__, precision__, layout__, KernelClass, alias__)

program

scope

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
class Scope final {
public:
Scope()
: kids_lock_{new lite::fluid::RWLock},
vars_lock_{new lite::fluid::RWLock},
rwlock_{new lite::fluid::RWLock} {}
// delete below two functions to allow pybind to recognise it cannot make a
// copy
// link:
// https://stackoverflow.com/questions/53807248/pybind11-returning-a-pointer-to-a-container-of-unique-ptr
Scope(const Scope&) = delete;
Scope& operator=(const Scope&) = delete;
~Scope();

Scope& NewScope() const;

Variable* Var(const std::string& name);

Variable* LocalVar(const std::string& name);

Variable* FindVar(const std::string& name) const;

Variable* FindLocalVar(const std::string& name) const;

const Scope* parent() const { return parent_; }

// Get attribute params stored in parent scopes.
std::vector<std::string> AttributeVarNames() const;
// Following the legacy scope interface.
std::vector<std::string> LocalVarNames() const;

/// ------------------------------------- helper functions for Tensor
/// ----------------------------------
// Create a Tensor variable. This will create a new Variable called `name`.
Tensor* NewTensor(const std::string& name) {
auto* var = Var(name);
return var->GetMutable<Tensor>();
}

const Tensor* FindTensor(const std::string& name) {
auto* var = FindVar(name);
if (!var) return nullptr;
return &var->Get<Tensor>();
}

Tensor* FindMutableTensor(const std::string& name) {
auto* var = FindVar(name);
if (!var) return nullptr;
return var->GetMutable<Tensor>();
}

std::vector<Tensor>* NewTensorList(const std::string& name) {
auto* var = Var(name);
return var->GetMutable<std::vector<Tensor>>();
}

const std::vector<Tensor>* FindTensorList(const std::string& name) {
auto* var = FindVar(name);
if (!var) return nullptr;
return &var->Get<std::vector<Tensor>>();
}

std::vector<Tensor>* FindMutableTensorList(const std::string& name) {
auto* var = FindVar(name);
if (!var) return nullptr;
return var->GetMutable<std::vector<Tensor>>();
}

private:
// Scope in `kids_` are owned by this class.
mutable std::list<Scope*> kids_;
const Scope* parent_{nullptr};
std::map<std::string, std::unique_ptr<Variable>> vars_;
std::unique_ptr<lite::fluid::RWLock> kids_lock_{nullptr};
std::unique_ptr<lite::fluid::RWLock> vars_lock_{nullptr};
std::unique_ptr<lite::fluid::RWLock> rwlock_{nullptr};
};

optimize

optimizer

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
/*
* lite::Optimizer optimize a program. It utilize the mir passes to analysis the
* program and export an optimized program.
* Example :
* // (1) Create an optimizer
* Optimizer optim(valid_places, kernel_pick_factor);
* // (2) add an optimizer method
* optim.AddPass("post_quant_dynamic_pass");
* // (3) analysis a program to export an optimized program
* auto program_ = optim.Run(std::move(program));
*/
class Optimizer {
public:
Optimizer(const std::vector<Place>& valid_places,
core::KernelPickFactor kernel_pick_factor)
: valid_places_(valid_places), kernel_pick_factor_(kernel_pick_factor) {
CHECK(!valid_places.empty()) << "At least one valid_place should be set";
}

// Append a pass to the optimizer.
void AddPass(const std::string& pass_name);
// Optimize a program to generate a runtime program.
std::unique_ptr<RuntimeProgram> Run(Program&& program);

protected:
// Run all the added passes.
void ApplyPasses(std::vector<std::unique_ptr<mir::SSAGraph>>* graphes);

// Generate the optimized runtime program.
std::unique_ptr<RuntimeProgram> GenRuntimeProgram(
std::vector<std::unique_ptr<mir::SSAGraph>>* graphs);

void InitTargetTypeTransformPass();
void InitControlFlowOpUnusedInputsAndOutputsEliminatePass();
void InitControlFlowOpSharedInputsAndOutputsPlaceSyncPass();
void SpecifyKernelPickTactic(core::KernelPickFactor factor);
Scope* exec_scope() { return exec_scope_; }

private:
std::vector<Place> valid_places_;
Scope* exec_scope_{};
std::vector<mir::Pass*> passes_;
std::vector<std::unique_ptr<mir::SSAGraph>> graphs_;
core::KernelPickFactor kernel_pick_factor_;
};

metal buffer and texture

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
// Round-trips half-float data through a 2D-array Metal texture.
// `dim` is defined outside this snippet; it is indexed as dim[0..3] below
// (looks like an NHWC shape — confirm against the caller).
id<MTLDevice> device = MTLCreateSystemDefaultDevice();

// Describe a 2D texture array of RGBA16Float slices; every slice packs four
// values per pixel, hence the round-up division by 4 for the slice count.
MTLTextureDescriptor* desc = [[MTLTextureDescriptor alloc] init];
[desc setTextureType:MTLTextureType2DArray];
[desc setDepth:1];
desc.width = static_cast<NSUInteger>(dim[2]);
desc.height = static_cast<NSUInteger>(dim[1]);
desc.arrayLength = static_cast<NSUInteger>(((dim[0]) * (dim[3]) + 3) / 4);
desc.pixelFormat = MTLPixelFormatRGBA16Float;
desc.usage = MTLTextureUsageShaderRead | MTLTextureUsageShaderWrite;
desc.storageMode = MTLStorageModeShared;

id<MTLTexture> image_ = [device newTextureWithDescriptor:desc];

int channels_per_pixel_ = 4;
// Read the slice count from the descriptor declared above (`desc`).
int array_length_ = static_cast<int>(desc.arrayLength);

// Host-side staging buffer, one uint16_t per half-float channel value.
// calloc zero-fills it so the upload below never reads indeterminate memory;
// real code would write the tensor data into `buffer` before uploading.
auto count = image_.width * image_.height * array_length_ * channels_per_pixel_;
auto buffer = static_cast<uint16_t*>(calloc(count, sizeof(uint16_t)));

auto bytes_per_row = image_.width * image_.depth * channels_per_pixel_ * sizeof(uint16_t);
auto bytes_per_image = image_.height * bytes_per_row;
// One full slice.
const MTLRegion region {
  .origin = {0, 0, 0},
  .size = {
    image_.width, image_.height, image_.depth,
  }
};

// copy from cpu to gpu, one array slice at a time
for (int i = 0; i < array_length_; ++i) {
  auto p = buffer + image_.width * image_.height * channels_per_pixel_ * i;
  [image_ replaceRegion:region
            mipmapLevel:0
                  slice:static_cast<NSUInteger>(i)
              withBytes:p
            bytesPerRow:bytes_per_row
          bytesPerImage:bytes_per_image];
}

// copy from gpu to cpu, one array slice at a time
auto* out_buffer = static_cast<uint16_t*>(malloc(sizeof(uint16_t) * count));
for (int i = 0; i < array_length_; ++i) {
  auto p = out_buffer + image_.width * image_.height * channels_per_pixel_ * i;

  [image_ getBytes:p
       bytesPerRow:bytes_per_row
     bytesPerImage:bytes_per_image
        fromRegion:region
       mipmapLevel:0
             slice:static_cast<NSUInteger>(i)];
}

// Release the host staging buffers once they are no longer needed.
free(buffer);
free(out_buffer);

swift

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
// `buffer`, `bufferWidth`, `w`, `h` and `data` are defined outside this snippet.
let device = MTLCreateSystemDefaultDevice()!
let queue = device.makeCommandQueue()!
let textureDescriptor = MTLTextureDescriptor()
textureDescriptor.textureType = .type2D
textureDescriptor.pixelFormat = .r16Uint
textureDescriptor.width = bufferWidth
textureDescriptor.height = 256
textureDescriptor.usage = [.shaderRead, .shaderWrite]

// Option 1: a texture created from an existing MTLBuffer.
// Named differently from the texture below: two `let texture` declarations in
// the same scope do not compile.
let bufferBackedTexture = buffer?.makeTexture(descriptor: textureDescriptor, offset: 0, bytesPerRow: bufferWidth*MemoryLayout<UInt16>.stride)

// Option 2: a texture allocated by the device and filled from CPU memory.
// NOTE(review): bytesPerRow of 4 * w implies 4 bytes per pixel, while the
// descriptor above uses .r16Uint — confirm which format `data` really has.
let texture = device.makeTexture(descriptor: textureDescriptor)
texture?.replace(region: MTLRegionMake2D(0, 0, w, h), mipmapLevel: 0, withBytes: data, bytesPerRow: 4 * w)

// buffer
let count = 1500
var myVector = [Float](repeating: 0, count: count)
let length = count * MemoryLayout< Float >.stride
// outBuffer is created from the still-zeroed vector...
let outBuffer = device.makeBuffer(bytes: myVector, length: length, options: [])
for (index, value) in myVector.enumerated() { myVector[index] = Float(index) }
// ...inBuffer from the vector after it has been filled with 0, 1, 2, ...
let inBuffer = device.makeBuffer(bytes: myVector, length: length, options: [])

本文主要介绍Alibaba MNN编译使用。

下载编译

下载

1
# The URL needs an explicit scheme; without it git treats the argument as a local path.
git clone https://github.com/alibaba/MNN.git

linux x86

1
2
# Run the schema generation script shipped in the repository.
./schema/generate.sh
# Configure and build out-of-tree in ./build using 4 parallel make jobs;
# each step runs only if the previous one succeeded.
mkdir build && cd build && cmake .. && make -j4

refer to https://www.yuque.com/mnn/en/build_linux

模型部署示例

1
2
3
// Placeholder entry point for the deployment example.
// The previous body called main() from itself, which is not allowed in C++
// and would recurse without end.
int main() {
  // TODO: add the model deployment example here.
  return 0;
}

content

RVO(Return Value Optimization)

NRVO(Named Return Value Optimization)

code snippets

1
2
3
4
5
6
7
8
9
#include <algorithm>
#include <cctype>
#include <string>

// Returns a copy of `data` with every character converted to lower case
// according to std::tolower in the current C locale. The input is unchanged.
std::string lower(const std::string &data) {
  std::string result = data;
  // The lambda takes unsigned char because std::tolower is undefined for
  // negative values other than EOF; the int result is narrowed back to char
  // explicitly.
  std::transform(result.begin(), result.end(), result.begin(),
                 [](unsigned char c) {
                   return static_cast<char>(std::tolower(c));
                 });
  return result;
}

conv详细介绍。

conv2d

1
2
3
4
5
6
7
8
9
10
11
12
13
// Returns false only when, for every index j in [0, strides.size()),
// filter_dim[j + 2] == 1, strides[j] == 1, paddings[j] == 0 and
// dilations[j] == 1; returns true as soon as any of these does not hold.
// The first two entries of filter_dim are skipped.
// Callers must supply at least strides.size() paddings and dilations and
// strides.size() + 2 filter dims.
inline bool IsExpand(const std::vector<int64_t>& filter_dim,
                     const std::vector<int>& strides,
                     const std::vector<int>& paddings,
                     const std::vector<int>& dilations) {
  const size_t spatial_rank = strides.size();
  for (size_t axis = 0; axis < spatial_rank; ++axis) {
    const bool trivial = static_cast<int>(filter_dim[axis + 2]) == 1 &&
                         strides[axis] == 1 && paddings[axis] == 0 &&
                         dilations[axis] == 1;
    if (!trivial) {
      return true;
    }
  }
  return false;
}
1
2
3
// use col_shape in the im2col calculation
// col_shape_vec:
// {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, o_d,o_h, o_w}

gemm calc

1
2
// use col_matrix_shape in the gemm calculation size:
// (i_c/g * k_h * k_w, o_h * o_w) or (i_c/g * k_d * k_h * k_w, o_d * o_h * o_w)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
// Output length of a convolution along one spatial axis:
//   (in_size + 2 * pad - dilation * (ksize - 1) - 1) / stride + 1
// Returns 0 when the dilated kernel is larger than the padded input, instead
// of letting the unsigned subtraction wrap around to a huge value.
// Requires stride > 0 and ksize > 0.
static inline size_t naive_conv_out_size(size_t in_size, size_t pad,
                                         size_t dilation, size_t ksize,
                                         size_t stride) {
  assert(stride > 0 && ksize > 0);
  const size_t padded = in_size + 2 * pad;
  const size_t extent = dilation * (ksize - 1) + 1;  // dilated kernel span
  if (padded < extent) return 0;
  return (padded - extent) / stride + 1;
}

// Reference (naive) grouped 2-D convolution, forward pass.
//
// Layouts, all dense and row-major:
//   src    : [n, c, h, w]
//   filter : [k, c / group, fy, fx]
//   dst    : [n, k, out_h, out_w]
// Parameters:
//   n                batch size
//   w, h             input width / height
//   c, k             input / output channels (both divisible by group)
//   fx, fy           filter width / height
//   px, py           zero padding along width / height
//   sx, sy           stride along width / height
//   dx, dy           dilation along width / height
//   group            number of groups (>= 1)
// Positions that fall into the padding contribute nothing to the sum.
static inline void naive_conv_fwd_nchw(const float *src, const float *filter,
                                       float *dst, size_t n, size_t w, size_t h,
                                       size_t c, size_t k, size_t fx, size_t fy,
                                       size_t px, size_t py, size_t sx,
                                       size_t sy, size_t dx, size_t dy,
                                       size_t group) {
  assert((group >= 1) && (c % group == 0) && (k % group == 0));
  const size_t oh = naive_conv_out_size(h, py, dy, fy, sy);
  const size_t ow = naive_conv_out_size(w, px, dx, fx, sx);
  const size_t k_per_group = k / group;
  const size_t c_per_group = c / group;
  for (size_t ig = 0; ig < group; ig++) {
    for (size_t in = 0; in < n; in++) {
      for (size_t ik = 0; ik < k_per_group; ik++) {
        for (size_t ioh = 0; ioh < oh; ioh++) {
          for (size_t iow = 0; iow < ow; iow++) {
            // sliding window for this filter
            float value = .0f;
            const size_t o_idx = in * k * oh * ow + ig * k_per_group * oh * ow +
                                 ik * oh * ow + ioh * ow + iow;
            for (size_t ic = 0; ic < c_per_group; ic++) {
              for (size_t ir = 0; ir < fy; ir++) {
                // Row in the padded input. All indices are unsigned, so the
                // padding test is done before subtracting py rather than by
                // comparing a wrapped-around value against 0.
                const size_t padded_h = sy * ioh + dy * ir;
                if (padded_h < py || padded_h - py >= h) continue;
                const size_t cur_h = padded_h - py;
                for (size_t is = 0; is < fx; is++) {
                  const size_t padded_w = sx * iow + dx * is;
                  if (padded_w < px || padded_w - px >= w) continue;
                  const size_t cur_w = padded_w - px;
                  const size_t i_idx = in * c * h * w +
                                       ig * c_per_group * h * w + ic * h * w +
                                       cur_h * w + cur_w;
                  const size_t f_idx =
                      ig * k_per_group * c_per_group * fy * fx +
                      ik * c_per_group * fy * fx + ic * fy * fx + ir * fx + is;
                  value += src[i_idx] * filter[f_idx];
                }
              }
            }
            dst[o_idx] = value;
          }
        }
      }
    }
  }
}

// group = 1
// Convenience overload without the `group` argument. It replaces what used to
// be a second, identical definition of the grouped function (a redefinition
// error) and simply forwards with group == 1.
static inline void naive_conv_fwd_nchw(const float *src, const float *filter,
                                       float *dst, size_t n, size_t w, size_t h,
                                       size_t c, size_t k, size_t fx, size_t fy,
                                       size_t px, size_t py, size_t sx,
                                       size_t sy, size_t dx, size_t dy) {
  naive_conv_fwd_nchw(src, filter, dst, n, w, h, c, k, fx, fy, px, py, sx, sy,
                      dx, dy, static_cast<size_t>(1));
}
1
2
3
4
5
6
7
// [bs, ic, ih, iw] & pack_size=8 => [bs, ic/8, ih, iw, 8]
// [bs, ic, ih, iw] & pack_size=4 => [bs, ic/4, ih, iw, 4]

// filter [oc, ic, kh, kw] & pack_in=8, pack_out=8 => [oc/8, ic/8, kh, kw, 8, 8]
// filter [oc, ic, kh, kw] & pack_in=4, pack_out=4 => [oc/4, ic/4, kh, kw, 4, 4]

// [bs, ]

conv3d

conv_depthwise

1
2
3
4
5
6
7
8
9
10
11
// [bs, ic, ih, iw] & pack_size=8 => [bs, ic/8, ih, iw, 8]
// [bs, ic, ih, iw] & pack_size=4 => [bs, ic/4, ih, iw, 4]

// filter [oc, ic/groups=1, kh, kw]
// filter [oc, 1, kh, kw] & pack_size=8 => [oc/8, kh, kw, 8]
// filter [oc, 1, kh, kw] & pack_size=4 => [oc/4, kh, kw, 4]

// output [bs, oc, oh, ow]
// output_trans [bs, oc/8, oh, ow, 8]
// output_trans [bs, oc/4, oh, ow, 4]
// [bs, oc/8, oh, ow, 8] => [bs, oc, oh, ow]

std::async 用法

1
2
// Declaration of the std::async overload that takes an explicit launch policy:
// `policy` selects how `fn(args...)` is run, and the returned future carries
// the call's result type.
template<class Fn, class... Args>
future<typename result_of<Fn(Args...)>::type> async(launch policy, Fn&& fn, Args&&...args);
  • std::launch::async
    调用时立即在新线程中异步执行(注意:不指定策略时,默认策略是 std::launch::async | std::launch::deferred,由实现决定采用哪一种)
  • std::launch::deferred
    延迟到std::future调用wait()或者get()时才执行,主线程调用,不创建新线程

std::async 封装

1
2
3
4
5
6
7
8
9
10
11
12
// Runs f(args...) on a newly created, detached thread and returns a future
// for its result. Unlike std::async with the default policy, the call is
// never deferred. The callable and its arguments are bound (copied/moved)
// via std::bind before the thread starts.
template <typename F, typename... Args>
auto really_async(F&& f, Args&&... args)
    -> std::future<typename std::result_of<F(Args...)>::type> {
  typedef typename std::result_of<F(Args...)>::type Result;

  // Package the bound call so its result (or exception) reaches the future.
  std::packaged_task<Result()> job(
      std::bind(std::forward<F>(f), std::forward<Args>(args)...));
  std::future<Result> pending = job.get_future();

  // The thread owns the task; detaching lets it outlive this function.
  std::thread worker(std::move(job));
  worker.detach();
  return pending;
}

std::future 用法

std::future_status 三种状态

  • deferred
    异步操作待开始
  • ready
    异步操作完成
  • timeout
    异步操作超时

std::promise

std::packaged_task

c++11 可变模版参数

1
2
// Variadic function template: `T` is a template parameter pack and `args`
// the matching function parameter pack.
template <class... T>
void f(T... args);
  • 递归函数展开参数包
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    #include <iostream>

    // Prints the number of arguments it was called with, followed by a newline.
    template <typename... Ts>
    void f(Ts... args) {
        const auto arity = sizeof...(args);
        std::cout << arity << std::endl;
    }

    // Recursion terminator: called once the parameter pack is exhausted.
    void func() {}

    // Prints `first` followed by a space, then recurses on the remaining
    // arguments; the empty pack resolves to the no-argument overload above.
    template <typename Head, typename... Tail>
    void func(Head first, Tail... remain) {
        std::cout << first << " ";
        func(remain...);
    }

    // Demo driver: expands the pack {2, 3, 9} through the recursive func().
    int main() {
        func(2, 3, 9);
        // Falling off the end of main is equivalent to `return 0;`.
    }
  • 逗号表达式展开参数包

介绍paddle、ncnn、tnn使用

paddle

x2paddle

x2paddle --framework=onnx --model=onnx_model.onnx --save_dir=mobilenet

paddle2onnx

paddle2onnx --model_dir paddle_model --save_file onnx_file --opset_version 10 --enable_onnx_checker True
paddle2onnx --model_dir paddle_model --model_filename model_filename --params_filename params_filename --save_file onnx_file --opset_version 10 --enable_onnx_checker True