docker

Posted on 2021-12-09 Edited on 2023-10-03

docker build

https://docs.docker.com/engine/reference/builder/

install docker

# Install the yum-config-manager helper, then register the Aliyun docker-ce mirror repository.
sudo yum install -y yum-utils
sudo yum-config-manager --add-repo http://mirrors.aliyun.com/docker-ce/linux/centos/docker-ce.repo

ubuntu docker

https://askubuntu.com/questions/1140183/install-gcc-9-on-ubuntu-18-04

# Ubuntu 16.04 image with gcc/g++ 9, Python 3.7, pip and pre-commit.
FROM ubuntu:16.04

LABEL com.zhangjun.image.authors="ewalker.zj@gmail.com"

# key=value is the current ENV syntax; the space-separated form is legacy.
ENV TZ="Asia/Shanghai"

# One layer: add the toolchain and deadsnakes PPAs, install gcc-9/g++-9 and
# python3.7, register them as the default gcc/g++/python alternatives,
# bootstrap pip, install pre-commit, then drop the apt caches.
# wget and ca-certificates are installed explicitly because the ubuntu:16.04
# base image ships neither, and the get-pip download below needs both.
# The versioned get-pip URL is used because the unversioned script refuses to
# run on Python 3.7.
RUN apt-get update && \
    apt-get -qqy install software-properties-common wget ca-certificates && \
    add-apt-repository -y ppa:ubuntu-toolchain-r/test && \
    add-apt-repository -y ppa:deadsnakes/ppa && \
    apt-get update && \
    apt-get -qqy install gcc-9 g++-9 && \
    apt-get -qqy install python3.7 && \
    update-alternatives --install /usr/bin/python python /usr/bin/python3.7 10 && \
    update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 40 --slave /usr/bin/g++ g++ /usr/bin/g++-9 && \
    wget https://bootstrap.pypa.io/pip/3.7/get-pip.py && \
    python3.7 get-pip.py && \
    python3.7 -m pip install pre-commit && \
    apt-get -qqy clean && \
    rm -rf get-pip.py && rm -rf /var/lib/apt/lists/*

# Interactive selection of the registered alternatives (run manually if needed):
#    update-alternatives --config gcc
#    update-alternatives --config python

macOS tips

Posted on 2021-12-09 Edited on 2023-10-03

macOS python load Framework

https://docs.python.org/3/library/ctypes.html

import os
from ctypes import cdll
from ctypes.util import find_library

print(os.name)

# Resolve each framework name to a loadable path; find_library yields None
# for a name it cannot resolve.
metal_library, core_graphics_library, mps_library = (
    find_library(framework_name)
    for framework_name in ("Metal", "CoreGraphics", "MetalPerformanceShaders")
)

# Load the resolved libraries in the same order and print each handle.
for resolved_path in (metal_library, core_graphics_library, mps_library):
    print(cdll.LoadLibrary(resolved_path))

Paddle Lite framework

Posted on 2021-09-19 Edited on 2023-10-03

core

context

// Type-erased holder for one target-specific context object.
class KernelContext {
 public:
  // Returns the stored context viewed as ContextT. If nothing is stored yet,
  // a ContextT is created in place first.
  template <typename ContextT>
  ContextT& As() {
    const bool is_empty = !ctx_.valid();
    if (is_empty) ctx_.set<ContextT>();
    return *ctx_.get_mutable<ContextT>();
  }

 private:
  Any ctx_;
};

class ContextScheduler {
 public:
  static ContextScheduler& Global() {
    static auto* x = new ContextScheduler;
    return *x;
  }
  std::unique_ptr<KernelContext> NewContext(
      TargetType target,
      /*only used for cuda context*/ int exec_stream_id = 0) {
    std::unique_ptr<KernelContext> ctx(new KernelContext);
    switch (target) {
      case TARGET(kHost):
        kernel_contexts_[TargetType::kHost].As<HostContext>().CopySharedTo(
            &ctx->As<HostContext>());
        break;
    }
    return ctx;
  } 
private:
  template <TargetType Type, typename ContextT>
  void InitContext() {
    kernel_contexts_[Type].As<ContextT>().InitOnce();
  }
  ContextScheduler() {
    InitContext<TargetType::kHost, HostContext>();
  }
private:
  std::map<TargetType, KernelContext> kernel_contexts_;
};

op lite

class OpLite : public Registry {
 public:
  OpLite() = default;
  explicit OpLite(const std::string &type) : op_type_(type) {}
  explicit OpLite(const std::vector<Place> &valid_places)
      : valid_places_(valid_places) {}

  void SetValidPlaces(const std::vector<Place> &places) {
    VLOG(5) << "valid places " << valid_places_.size();
    valid_places_ = places;
  }
  virtual bool Run();
  // Indicate whether the Op runs only once or not
  virtual bool run_once() const { return false; }
  std::string Type() const { return op_type_; }

  // Link the external execution environ to internal context.
  bool Attach(const cpp::OpDesc &opdesc, lite::Scope *scope);

  template <typename T>
  inline void AttachParam(T *param) {
    op_param_ = static_cast<T *>(param);
  }
  // Create all the kernels for the valid targets.
  std::vector<std::unique_ptr<KernelBase>> CreateKernels(
      const std::vector<Place> &places, const std::string &kernel_type = "");

  Scope *scope() { return scope_; }

  // Assign op param to kernel.
  virtual void AttachKernel(KernelBase *kernel) = 0;
  void SetKernel(std::vector<std::unique_ptr<KernelBase>> &kernels) {  // NOLINT
    kernel_ = std::move(kernels.front());
    kernel_->SetContext(
        ContextScheduler::Global().NewContext(kernel_->target()));
  }

  KernelBase *GetKernel() {  // NOLINT
    return kernel_.get();
  }
  virtual ~OpLite() = default;
 protected:
  friend class mir::Node;
  friend class mir::SSAGraph;
 protected:
  Scope *scope_{nullptr};
  std::unique_ptr<KernelBase> kernel_;
  std::string op_type_;
  std::vector<Place> valid_places_;
  Place kernel_place_{TARGET(kHost), PRECISION(kFloat)};
  std::unique_ptr<OpInfo> op_info_;
  // todo: it's prefered to combine last_input_shapes and
  // last_input_lods into a single hash value to decrease
  // memory usage.
  std::vector<DDimLite> last_input_shapes{};
  std::vector<std::vector<std::vector<uint64_t>>> last_input_lods{};
  std::vector<DDimLite> last_output_shapes{};
  std::vector<std::vector<std::vector<uint64_t>>> last_output_lods{};
  mutable operators::ParamBase *op_param_{nullptr};

 private:
  // Infer Shape according to memory, if current input shapes are consistent
  // with that of previous inputs, output shapes of last time will be reused.
  bool InferShapeWithCache();
};

std::vector<std::unique_ptr<KernelBase>> OpLite::CreateKernels(
    const std::vector<Place> &places, const std::string &kernel_type) {
  std::vector<std::unique_ptr<KernelBase>> kernels;
  CHECK(!op_type_.empty()) << "op_type_ should be set first";

  auto pick_kernel = [&](const Place &place) {
    auto ks = KernelRegistry::Global().Create(
        op_type_, place.target, place.precision, place.layout);
    VLOG(5) << "pick kernel for " << op_info()->Type() << " "
            << place.DebugString() << " get " << ks.size() << " kernels";
    for (auto &&it : ks) {
      AttachKernel(it.get());
      kernels.emplace_back(std::move(it));
    }
  };

  if (!kernel_type.empty()) {
    Place place;
    std::string op_type, alias;
    KernelBase::ParseKernelType(kernel_type, &op_type, &alias, &place);
    pick_kernel(place);
    CHECK(!kernels.empty()) << "no kernel for kernel type " << kernel_type;
    return kernels;
  }

  std::set<Place> expanded_places(places.begin(), places.end());
  for (auto &place : places) {
    // Pick kernels those support any Precision and any DataLayout, For example:
    // kARM,kFloat,kNCHW -> kARM,kFloat,kAny; kARM,kAny,kNCHW; kARM,kAny,kAny
    expanded_places.insert(
        Place(place.target, place.precision, DATALAYOUT(kAny)));
    expanded_places.insert(Place(place.target, PRECISION(kAny), place.layout));
    expanded_places.insert(
        Place(place.target, PRECISION(kAny), DATALAYOUT(kAny)));
  }

  std::set<TargetType> targets;
  for (auto place : expanded_places) {
    pick_kernel(place);
    targets.insert(place.target);
  }

  VLOG(5) << "op " << op_type_ << " get " << kernels.size() << " kernels";
  return kernels;
}

/*
 * Operator Information, such as some description. It will be shared by all the
 * kernels of the same operator.
 */
class OpInfo : public cpp::OpDesc {
 public:
  OpInfo(const OpInfo &) = default;
  explicit OpInfo(const cpp::OpDesc &other) : cpp::OpDesc(other) {}
};

op registry

// Singleton with two name -> path tables, one for ops and one for kernels.
class OpKernelInfoCollector {
 public:
  // The instance is heap-allocated on first use and never deleted.
  static OpKernelInfoCollector& Global() {
    static OpKernelInfoCollector* collector = new OpKernelInfoCollector;
    return *collector;
  }

  // Defined out of line; the names suggest they fill op2path_ / kernel2path_.
  void AddOp2path(const std::string& op_name, const std::string& op_path);
  void AddKernel2path(const std::string& kernel_name,
                      const std::string& kernel_path);

 private:
  std::map<std::string, std::string> op2path_;
  std::map<std::string, std::string> kernel2path_;
};

class OpLiteFactory {
 public:
  // Register a function to create an op
  void RegisterCreator(const std::string& op_type,
                       std::function<std::shared_ptr<OpLite>()> fun) {
    op_registry_[op_type] = fun;
  }

  static OpLiteFactory& Global() {
    static OpLiteFactory* x = new OpLiteFactory;
    return *x;
  }

  std::shared_ptr<OpLite> Create(const std::string& op_type) const {
    auto it = op_registry_.find(op_type);
    if (it == op_registry_.end()) return nullptr;
    return it->second();
  }

  std::string DebugString();

  std::vector<std::string> GetAllOps() const {
    std::vector<std::string> res;
    for (const auto& op : op_registry_) {
      res.push_back(op.first);
    }
    return res;
  }

 protected:
  std::map<std::string, std::function<std::shared_ptr<OpLite>()>> op_registry_;
};

class OpLiteRegistrar {
 public:
  OpLiteRegistrar(const std::string& op_type,
                  std::function<std::shared_ptr<OpLite>()> fun) {
    OpLiteFactory::Global().RegisterCreator(op_type, fun);
  }
  // Touch function is used to guarantee registrar was initialized.
  void touch() {}
};

class KernelFactory {
 public:
  // Register a function to create kernels
  void RegisterCreator(const std::string& op_type,
                       TargetType target,
                       PrecisionType precision,
                       DataLayoutType layout,
                       std::function<std::unique_ptr<KernelBase>()> fun) {
    op_registry_[op_type][std::make_tuple(target, precision, layout)].push_back(
        fun);
  }

  static KernelFactory& Global() {
    static KernelFactory* x = new KernelFactory;
    return *x;
  }

  /**
   * Create all kernels belongs to an op.
   */
  std::list<std::unique_ptr<KernelBase>> Create(const std::string& op_type) {
    std::list<std::unique_ptr<KernelBase>> res;
    if (op_registry_.find(op_type) == op_registry_.end()) return res;
    auto& kernel_registry = op_registry_[op_type];
    for (auto it = kernel_registry.begin(); it != kernel_registry.end(); ++it) {
      for (auto& fun : it->second) {
        res.emplace_back(fun());
      }
    }
    return res;
  }

  /**
   * Create a specific kernel. Return a list for API compatible.
   */
  std::list<std::unique_ptr<KernelBase>> Create(const std::string& op_type,
                                                TargetType target,
                                                PrecisionType precision,
                                                DataLayoutType layout) {
    std::list<std::unique_ptr<KernelBase>> res;
    if (op_registry_.find(op_type) == op_registry_.end()) return res;
    auto& kernel_registry = op_registry_[op_type];
    auto it = kernel_registry.find(std::make_tuple(target, precision, layout));
    if (it == kernel_registry.end()) return res;
    for (auto& fun : it->second) {
      res.emplace_back(fun());
    }
    return res;
  }

 protected:
  // Outer map: op -> a map of kernel.
  // Inner map: kernel -> creator function.
  // Each kernel was represented by a combination of <TargetType, PrecisionType,
  // DataLayoutType>
  std::map<std::string,
           std::map<std::tuple<TargetType, PrecisionType, DataLayoutType>,
                    std::list<std::function<std::unique_ptr<KernelBase>()>>>>
      op_registry_;
};

// Register Kernel by initializing a static KernelRegistrar instance
class KernelRegistrar {
 public:
  KernelRegistrar(const std::string& op_type,
                  TargetType target,
                  PrecisionType precision,
                  DataLayoutType layout,
                  std::function<std::unique_ptr<KernelBase>()> fun) {
    KernelFactory::Global().RegisterCreator(
        op_type, target, precision, layout, fun);
  }
  // Touch function is used to guarantee registrar was initialized.
  void touch() {}
};

// Placeholder registry: NewInstance exposes a chainable binding interface in
// which every call ignores its arguments and does nothing.
class ParamTypeDummyRegistry {
 public:
  struct NewInstance {
    NewInstance() {}
    // No-op; returns *this so calls can be chained.
    NewInstance& BindInput(const std::string& arg_name,
                           const ParamType& ptype) {
      return *this;
    }
    // No-op; returns *this so calls can be chained.
    NewInstance& BindOutput(const std::string& arg_name,
                            const ParamType& ptype) {
      return *this;
    }
    // No-op; returns *this so calls can be chained.
    NewInstance& SetVersion(const std::string& version) { return *this; }
    // No-op; returns *this so calls can be chained.
    NewInstance& BindPaddleOpVersion(const std::string& op_type,
                                     int32_t version_id) {
      return *this;
    }
    // Always reports success.
    bool Finalize() { return true; }
  };

 private:
  // The outer class cannot be instantiated from outside.
  ParamTypeDummyRegistry() = default;
};

注册机制

op注册

// Registers an op class under the name `op_type__`.
// Expands to:
//   * a static OpLiteRegistrar named <op_type__>__registry, constructed with
//     the stringified op type and a lambda that creates `new OpClass(name)`;
//   * a function touch_op_<op_type__>() that calls the registrar's touch(),
//     records the op name together with __FILE__ via
//     OpKernelInfoCollector::AddOp2path, and returns 0.
#define REGISTER_LITE_OP(op_type__, OpClass)                                   \
  static paddle::lite::OpLiteRegistrar op_type__##__registry(                  \
      #op_type__, []() {                                                       \
        return std::unique_ptr<paddle::lite::OpLite>(new OpClass(#op_type__)); \
      });                                                                      \
  int touch_op_##op_type__() {                                                 \
    op_type__##__registry.touch();                                             \
    OpKernelInfoCollector::Global().AddOp2path(#op_type__, __FILE__);          \
    return 0;                                                                  \
  }

kernel 注册

// Register a kernel.
// Expands to:
//   * a static KernelRegistrar whose name is built from all six macro
//     arguments, constructed with the stringified op type, the
//     TARGET/PRECISION/DATALAYOUT values and a lambda that creates a
//     KernelClass and sets its op type and alias;
//   * a function touch_<op><target><precision><layout><alias>() that calls
//     the registrar's touch(), records the comma-joined kernel name together
//     with __FILE__ via OpKernelInfoCollector::AddKernel2path, and returns 0;
//   * a trailing ParamTypeRegistry(...) invocation, so further text can
//     follow the macro use (presumably chained .BindInput(...) style calls
//     and a terminating `;` — confirm against ParamTypeRegistry's definition).
#define REGISTER_LITE_KERNEL(                                                 \
    op_type__, target__, precision__, layout__, KernelClass, alias__)         \
  static paddle::lite::KernelRegistrar                                        \
      op_type__##target__##precision__##layout__##alias__##_kernel_registry(  \
          #op_type__,                                                         \
          TARGET(target__),                                                   \
          PRECISION(precision__),                                             \
          DATALAYOUT(layout__),                                               \
          []() {                                                              \
            std::unique_ptr<KernelClass> x(new KernelClass);                  \
            x->set_op_type(#op_type__);                                       \
            x->set_alias(#alias__);                                           \
            return x;                                                         \
          });                                                                 \
  int touch_##op_type__##target__##precision__##layout__##alias__() {         \
    op_type__##target__##precision__##layout__##alias__##_kernel_registry     \
        .touch();                                                             \
    OpKernelInfoCollector::Global().AddKernel2path(                           \
        #op_type__ "," #target__ "," #precision__ "," #layout__ "," #alias__, \
        __FILE__);                                                            \
    return 0;                                                                 \
  }                                                                           \
  ParamTypeRegistry(                                                          \
      op_type__, target__, precision__, layout__, KernelClass, alias__)

program

scope

class Scope final {
 public:
  Scope()
      : kids_lock_{new lite::fluid::RWLock},
        vars_lock_{new lite::fluid::RWLock},
        rwlock_{new lite::fluid::RWLock} {}
  // delete below two functions to allow pybind to recognise it cannot make a
  // copy
  // link:
  // https://stackoverflow.com/questions/53807248/pybind11-returning-a-pointer-to-a-container-of-unique-ptr
  Scope(const Scope&) = delete;
  Scope& operator=(const Scope&) = delete;
  ~Scope();

  Scope& NewScope() const;

  Variable* Var(const std::string& name);

  Variable* LocalVar(const std::string& name);

  Variable* FindVar(const std::string& name) const;

  Variable* FindLocalVar(const std::string& name) const;

  const Scope* parent() const { return parent_; }

  // Get attribute params stored in parent scopes.
  std::vector<std::string> AttributeVarNames() const;
  // Following the legacy scope interface.
  std::vector<std::string> LocalVarNames() const;

  /// ------------------------------------- helper functions for Tensor
  /// ----------------------------------
  // Create a Tensor variable. This will create a new Variable called `name`.
  Tensor* NewTensor(const std::string& name) {
    auto* var = Var(name);
    return var->GetMutable<Tensor>();
  }

  const Tensor* FindTensor(const std::string& name) {
    auto* var = FindVar(name);
    if (!var) return nullptr;
    return &var->Get<Tensor>();
  }

  Tensor* FindMutableTensor(const std::string& name) {
    auto* var = FindVar(name);
    if (!var) return nullptr;
    return var->GetMutable<Tensor>();
  }

  std::vector<Tensor>* NewTensorList(const std::string& name) {
    auto* var = Var(name);
    return var->GetMutable<std::vector<Tensor>>();
  }

  const std::vector<Tensor>* FindTensorList(const std::string& name) {
    auto* var = FindVar(name);
    if (!var) return nullptr;
    return &var->Get<std::vector<Tensor>>();
  }

  std::vector<Tensor>* FindMutableTensorList(const std::string& name) {
    auto* var = FindVar(name);
    if (!var) return nullptr;
    return var->GetMutable<std::vector<Tensor>>();
  }

 private:
  // Scope in `kids_` are owned by this class.
  mutable std::list<Scope*> kids_;
  const Scope* parent_{nullptr};
  std::map<std::string, std::unique_ptr<Variable>> vars_;
  std::unique_ptr<lite::fluid::RWLock> kids_lock_{nullptr};
  std::unique_ptr<lite::fluid::RWLock> vars_lock_{nullptr};
  std::unique_ptr<lite::fluid::RWLock> rwlock_{nullptr};
};

optimize

optimizer

/*
 * lite::Optimizer optimizes a program: it uses the mir passes to analyze the
 * program and exports an optimized runtime program.
 * Example:
 *       // (1) Create an optimizer
 *       Optimizer optim(valid_places, kernel_pick_factor);
 *       // (2) Add an optimization pass
 *       optim.AddPass("post_quant_dynamic_pass");
 *       // (3) Analyze a program and export the optimized program
 *       auto program_ = optim.Run(std::move(program));
 */
class Optimizer {
 public:
  // Stores the candidate places and the kernel pick factor; an empty place
  // list is a fatal error.
  Optimizer(const std::vector<Place>& valid_places,
            core::KernelPickFactor kernel_pick_factor)
      : valid_places_(valid_places), kernel_pick_factor_(kernel_pick_factor) {
    CHECK(!valid_places_.empty()) << "At least one valid_place should be set";
  }

  // Append a pass to the optimizer.
  void AddPass(const std::string& pass_name);

  // Optimize a program to generate a runtime program.
  std::unique_ptr<RuntimeProgram> Run(Program&& program);

 protected:
  // Run all the added passes.
  void ApplyPasses(std::vector<std::unique_ptr<mir::SSAGraph>>* graphes);

  // Generate the optimized runtime program.
  std::unique_ptr<RuntimeProgram> GenRuntimeProgram(
      std::vector<std::unique_ptr<mir::SSAGraph>>* graphs);

  // Pass-specific setup hooks (defined out of line).
  void InitTargetTypeTransformPass();
  void InitControlFlowOpUnusedInputsAndOutputsEliminatePass();
  void InitControlFlowOpSharedInputsAndOutputsPlaceSyncPass();
  void SpecifyKernelPickTactic(core::KernelPickFactor factor);

  Scope* exec_scope() { return exec_scope_; }

 private:
  std::vector<Place> valid_places_;
  Scope* exec_scope_{nullptr};
  std::vector<mir::Pass*> passes_;
  std::vector<std::unique_ptr<mir::SSAGraph>> graphs_;
  core::KernelPickFactor kernel_pick_factor_;
};

Metal Basic

Posted on 2021-09-05 Edited on 2023-10-03

metal buffer and texture

// Round-trips half-float data between CPU memory and a 2D-array Metal texture.
id<MTLDevice> device = MTLCreateSystemDefaultDevice();

// Describe an RGBA16Float 2D texture array.
// NOTE(review): `dim` is defined outside this snippet; the indexing below
// reads like an {N, H, W, C} shape with N*C packed four channels per pixel —
// confirm against the caller.
MTLTextureDescriptor* desc = [[MTLTextureDescriptor alloc] init];
[desc setTextureType:MTLTextureType2DArray];
[desc setDepth:1];
desc.width = static_cast<NSUInteger>(dim[2]);
desc.height = static_cast<NSUInteger>(dim[1]);
desc.arrayLength = static_cast<NSUInteger>(((dim[0]) * (dim[3]) + 3) / 4);
desc.pixelFormat = MTLPixelFormatRGBA16Float;
desc.usage = MTLTextureUsageShaderRead | MTLTextureUsageShaderWrite;
desc.storageMode = MTLStorageModeShared;

id<MTLTexture> image_ = [device newTextureWithDescriptor:desc];

int channels_per_pixel_ = 4;
// Read the slice count from `desc`; no `desc_` is declared in this snippet.
int array_length_ = static_cast<int>(desc.arrayLength);

// One uint16_t per channel, for every pixel of every slice.
auto count = image_.width * image_.height * array_length_ * channels_per_pixel_;
// Staging buffer for the upload; fill it with the half-float payload before
// the copy below (malloc leaves the contents indeterminate).
auto buffer = static_cast<uint16_t*>(malloc(sizeof(uint16_t) * count));

auto bytes_per_row = image_.width * image_.depth * channels_per_pixel_ * sizeof(uint16_t);
auto bytes_per_image = image_.height * bytes_per_row;
// The region covers one whole slice.
const MTLRegion region {
    .origin = {0, 0, 0},
    .size ={
        image_.width, image_.height, image_.depth,
    }
};

// copy from cpu to gpu, one array slice at a time
for (int i = 0; i < array_length_; ++i) {
    auto p = buffer + image_.width * image_.height * channels_per_pixel_ * i;
    [image_ replaceRegion:region
              mipmapLevel:0
                    slice:static_cast<NSUInteger>(i)
                withBytes:p
              bytesPerRow:bytes_per_row
            bytesPerImage:bytes_per_image];
}

// copy from gpu to cpu, one array slice at a time
auto* out_buffer = static_cast<uint16_t*>(malloc(sizeof(uint16_t) * count));
for (int i = 0; i < array_length_; ++i) {
    auto p = out_buffer + image_.width * image_.height * channels_per_pixel_ * i;

    [image_ getBytes:p
         bytesPerRow:bytes_per_row
       bytesPerImage:bytes_per_image
          fromRegion:region
         mipmapLevel:0
               slice:static_cast<NSUInteger>(i)];
}

// Release the staging buffers once the data has been consumed.
free(buffer);
free(out_buffer);

swift

let device = MTLCreateSystemDefaultDevice()!
let queue = device.makeCommandQueue()!

// Describe a single-channel 16-bit unsigned integer 2D texture.
let textureDescriptor = MTLTextureDescriptor()
textureDescriptor.textureType = .type2D
textureDescriptor.pixelFormat = .r16Uint
textureDescriptor.width = bufferWidth
textureDescriptor.height = 256
textureDescriptor.usage = [.shaderRead, .shaderWrite]

// Option 1: create the texture from an existing MTLBuffer (`buffer` is defined elsewhere).
let bufferTexture = buffer?.makeTexture(descriptor: textureDescriptor, offset: 0, bytesPerRow: bufferWidth*MemoryLayout<UInt16>.stride)

// Option 2: let the device allocate the texture, then upload the pixel data.
// NOTE(review): bytesPerRow is 4 * w, but .r16Uint is 2 bytes per pixel — confirm the layout of `data`.
let texture = device.makeTexture(descriptor: textureDescriptor)
texture?.replace(region: MTLRegionMake2D(0, 0, w, h), mipmapLevel: 0, withBytes: data, bytesPerRow: 4 * w)

// buffer
let count = 1500
var myVector = [Float](repeating: 0, count: count)
let length = count * MemoryLayout< Float >.stride
// outBuffer is created while myVector is still all zeros.
let outBuffer = device.makeBuffer(bytes: myVector, length: length, options: [])
for (index, value) in myVector.enumerated() { myVector[index] = Float(index) }
// inBuffer is created after myVector has been filled with 0, 1, 2, ...
let inBuffer = device.makeBuffer(bytes: myVector, length: length, options: [])

MNN

Posted on 2021-06-07 Edited on 2023-10-03

本文主要介绍Alibaba MNN编译使用。

下载编译

下载

git clone https://github.com/alibaba/MNN.git

linux x86

./schema/generate.sh
mkdir build && cd build && cmake .. && make -j4

refer to https://www.yuque.com/mnn/en/build_linux

模型部署示例

1
2
3

// Placeholder entry point for the deployment example.
// C++ does not allow a program to call main() itself, so the placeholder
// simply reports success.
int main() {
  return 0;
}

gcc

Posted on 2021-04-14 Edited on 2023-10-03

content

RVO(Return Value Optimization)

NRVO(Named Return Value Optimization)

code snippets

#include <algorithm>
#include <cctype>
#include <string>

// Returns a copy of `data` with every character passed through std::tolower.
// Each character is taken as unsigned char first, because std::tolower is
// undefined for negative values other than EOF.
std::string lower(const std::string &data) {
  std::string result = data;
  std::transform(result.begin(), result.end(), result.begin(),
    [](unsigned char c){ return std::tolower(c); });
  return result;
}

conv

Posted on 2021-04-10 Edited on 2023-10-03

conv详细介绍。

conv2d

// Returns false only when, for every index j of `strides`, the filter extent
// filter_dim[j + 2] is 1, the stride is 1, the padding is 0 and the dilation
// is 1; returns true as soon as any of these does not hold.
// With empty `strides` the result is false.
inline bool IsExpand(const std::vector<int64_t>& filter_dim,
                     const std::vector<int>& strides,
                     const std::vector<int>& paddings,
                     const std::vector<int>& dilations) {
  for (size_t axis = 0; axis < strides.size(); ++axis) {
    const bool unit_filter = static_cast<int>(filter_dim[axis + 2]) == 1;
    const bool axis_is_trivial = unit_filter && strides[axis] == 1 &&
                                 paddings[axis] == 0 && dilations[axis] == 1;
    if (!axis_is_trivial) {
      return true;
    }
  }
  return false;
}

1
2
3

// use col_shape in the im2col calculation
// col_shape_vec:
// {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, o_d,o_h, o_w}

gemm calc

// use col_matrix_shape in the gemm calculation size:
// (i_c/g * k_h * k_w, o_h * o_w) or (i_c/g * k_d * k_h * k_w, o_d * o_h * o_w)

// Output extent of a convolution along one axis:
//   (in_size + 2 * pad - dilation * (ksize - 1) - 1) / stride + 1
// All arithmetic is unsigned, so the caller must ensure ksize >= 1,
// stride >= 1 and that the padded input is at least as large as the dilated
// kernel; otherwise the subtraction wraps around.
static inline size_t naive_conv_out_size(size_t in_size, size_t pad,
                                         size_t dilation, size_t ksize,
                                         size_t stride) {
    return (in_size + 2 * pad - dilation * (ksize - 1) - 1) / stride + 1;
}

// Reference (naive) grouped forward convolution in NCHW layout.
//
//   src    : [n, c, h, w]
//   filter : [k, c / group, fy, fx]
//   dst    : [n, k, out_h, out_w], fully overwritten
//
// fx/fy are the filter width/height, px/py the padding, sx/sy the strides and
// dx/dy the dilations along width/height. `group` must be >= 1 and divide
// both c and k. Input positions that fall into the padding contribute zero.
static inline void naive_conv_fwd_nchw(const float *src, const float *filter,
                                       float *dst, size_t n, size_t w, size_t h,
                                       size_t c, size_t k, size_t fx, size_t fy,
                                       size_t px, size_t py, size_t sx,
                                       size_t sy, size_t dx, size_t dy, size_t group) {
    assert((group >= 1) && (c % group == 0) && (k % group == 0));
    const size_t oh = naive_conv_out_size(h, py, dy, fy, sy);
    const size_t ow = naive_conv_out_size(w, px, dx, fx, sx);
    const size_t k_per_group = k / group;
    const size_t c_per_group = c / group;

    for (size_t ig = 0; ig < group; ig++) {
        for (size_t ib = 0; ib < n; ib++) {
            for (size_t ik = 0; ik < k_per_group; ik++) {
                for (size_t ioh = 0; ioh < oh; ioh++) {
                    for (size_t iow = 0; iow < ow; iow++) {
                        // sliding window for this filter
                        float value = .0f;
                        const size_t o_idx = ib * k * oh * ow + ig * k_per_group * oh * ow +
                                             ik * oh * ow + ioh * ow + iow;
                        for (size_t ic = 0; ic < c_per_group; ic++) {
                            for (size_t ir = 0; ir < fy; ir++) {
                                // Unsigned arithmetic: a row above the image
                                // wraps to a huge value, so the single
                                // `>= h` test rejects both out-of-range sides.
                                const size_t cur_h = sy * ioh - py + dy * ir;
                                if (cur_h >= h)
                                    continue;
                                for (size_t is = 0; is < fx; is++) {
                                    // Same wraparound reasoning as for cur_h.
                                    const size_t cur_w = sx * iow - px + dx * is;
                                    if (cur_w >= w)
                                        continue;
                                    const size_t i_idx = ib * c * h * w + ig * c_per_group * h * w +
                                                         ic * h * w + cur_h * w + cur_w;
                                    const size_t f_idx = ig * k_per_group * c_per_group * fy * fx +
                                                         ik * c_per_group * fy * fx + ic * fy * fx +
                                                         ir * fx + is;
                                    value += src[i_idx] * filter[f_idx];
                                }
                            }
                        }
                        dst[o_idx] = value;
                    }
                }
            }
        }
    }
}

// group = 1
// Convenience overload without the `group` argument; forwards to the grouped
// implementation above with a single group. (A second definition with the
// identical parameter list would be a redefinition error.)
static inline void naive_conv_fwd_nchw(const float *src, const float *filter,
                                       float *dst, size_t n, size_t w, size_t h,
                                       size_t c, size_t k, size_t fx, size_t fy,
                                       size_t px, size_t py, size_t sx,
                                       size_t sy, size_t dx, size_t dy) {
    naive_conv_fwd_nchw(src, filter, dst, n, w, h, c, k, fx, fy, px, py, sx, sy,
                        dx, dy, static_cast<size_t>(1));
}

// [bs, ic, ih, iw] & pack_size=8 => [bs, ic/8, ih, iw, 8]
// [bs, ic, ih, iw] & pack_size=4 => [bs, ic/4, ih, iw, 4]

// filter [oc, ic, kh, kw] & pack_in=8, pack_out=8 => [oc/8, ic/8, kh, kw, 8, 8]
// filter [oc, ic, kh, kw] & pack_in=4, pack_out=4 => [oc/4, ic/4, kh, kw, 4, 4]

// [bs, ]

conv3d

conv_depthwise

// [bs, ic, ih, iw] & pack_size=8 => [bs, ic/8, ih, iw, 8]
// [bs, ic, ih, iw] & pack_size=4 => [bs, ic/4, ih, iw, 4]

// filter [oc, ic/groups=1, kh, kw]
// filter [oc, 1, kh, kw] & pack_size=8 => [oc/8, kh, kw, 8]
// filter [oc, 1, kh, kw] & pack_size=4 => [oc/4, kh, kw, 4]

// output [bs, oc, oh, ow]
// output_trans [bs, oc/8, oh, ow, 8]
// output_trans [bs, oc/4, oh, ow, 4]
// [bs, oc/8, oh, ow, 8] => [bs, oc, oh, ow]

std::async、std::future

Posted on 2021-04-05 Edited on 2023-10-03

std::async 用法

template<class Fn, class... Args>
future<typename result_of<Fn(Args...)>::type> async(launch policy, Fn&& fn, Args&&...args);

std::launch::async
调用时创建新线程异步执行
std::launch::deferred
延迟到std::future调用wait()或者get()时才执行，主线程调用，不创建新线程
注意：不指定策略时，默认值是 std::launch::async | std::launch::deferred，具体采用哪种方式由实现决定

std::async 封装

// Runs f(args...) on a newly created, detached thread and returns a future
// for its result. The callable and its arguments are bound into a
// packaged_task, whose future is obtained before the task is moved into the
// thread.
template <typename F, typename... Args>
auto really_async(F&& f, Args&&... args)
-> std::future<typename std::result_of<F(Args...)>::type>
{
    typedef typename std::result_of<F(Args...)>::type Result;
    std::packaged_task<Result()> job(
        std::bind(std::forward<F>(f), std::forward<Args>(args)...));
    std::future<Result> pending = job.get_future();
    std::thread(std::move(job)).detach();
    return pending;
}

std::future 用法

std::future_status 三种状态

deferred
异步操作待开始
ready
异步操作完成
timeout
异步操作超时

std::promise

std::packaged_task

可变模版参数

Posted on 2021-04-04 Edited on 2023-10-03

c++11 可变模版参数

template <class... T>
void f(T... args);

递归函数展开参数包

#include <iostream>

// Prints the number of arguments in the pack, followed by a newline (flushed).
template <class... T>
void f(T... args) {
  const auto pack_size = sizeof...(args);
  std::cout << pack_size << std::endl;
}

// Overload for the empty pack; needed so the recursive call below compiles
// when `remain` is empty.
void func() {}

// Prints `first` followed by a space, then recurses on the remaining
// arguments while any are left.
template <class T, class... Args>
void func(T first, Args... remain) {
  std::cout << first << " ";
  if (sizeof...(remain) != 0) {
    func(remain...);
  }
}

// Demo entry point: expands the pack (2, 3, 9) through func().
int main() {
  func(2, 3, 9);
  return 0;
}

逗号表达式展开参数包

models

Posted on 2021-03-23 Edited on 2023-10-03

介绍paddle、ncnn、tnn使用

paddle

x2paddle

x2paddle --framework=onnx --model=onnx_model.onnx --save_dir=mobilenet

paddle2onnx

paddle2onnx --model_dir paddle_model --save_file onnx_file --opset_version 10 --enable_onnx_checker True
paddle2onnx --model_dir paddle_model --model_filename model_filename --params_filename params_filename --save_file onnx_file --opset_version 10 --enable_onnx_checker True