cuda手搓CNN识别手写数字

英伟达提出了CUDA框架，用以实现GPU编程。CUDA C以C语言为基础，目前的CUDA编译器已经能够支持C++17的语法，但是CUDA C的基础语法仍然只能使用C。
最近结合使用C++模板编程和CUDA C，手搓了一个CNN。其中矩阵的点乘依据的是之前博客提出的原理，激活函数和更新方法延续了之前模板元编程的思路，并且复用了一部分原来的代码。CUDA的矩阵加和、softmax等都是在AI生成的代码上修改而来（笔者使用的是GitHub Copilot，它写出来的代码不能全信，因为它考虑得太过片面）。
笔者使用的显卡是Nvidia Tesla P4。这是一块玩具卡，市面上的价格在250块以下，相当实惠。
在这里插入图片描述
下面简短地贴一下主干的代码，首先是卷积层：

#ifndef __CNN_BASE_HPP__
#define __CNN_BASE_HPP__
#include "mat.hpp"
#include "bp_network.hpp"/* 卷积层 */
template<int input_row, int input_col, int tpl_row, int tpl_col, int row_step, int col_step, template<typename> class update_method_templ, template<typename> class activate_func, typename tpl_init_method, typename val_t = double>
struct conv_layer 
{using tpl_type = mat<tpl_row, tpl_col, val_t>;using input_type = mat<input_row, input_col, val_t>;using pad_type = mat<input_row + get_pad_size(input_row, tpl_row, row_step), input_col + get_pad_size(input_col, tpl_col, col_step), val_t>;using pad_size = pad_size_t<input_row, input_col, tpl_row, tpl_col, row_step, col_step>;using ret_type = decltype(inner_conv<row_step, col_step>(input_type().template pad<pad_size::top, pad_size::left, pad_size::right, pad_size::bottom>(), tpl_type()));tpl_type mt_tpl;update_method_templ<mat<tpl_row, tpl_col, val_t>>	um_tpl;pad_type mt_input;ret_type mt_bias;update_method_templ<ret_type>	um_bias;activate_func<ret_type>	act_func;conv_layer(){mt_tpl.template reset<tpl_init_method>();mt_bias.template reset<tpl_init_method>();}inline ret_type forward(const input_type& mt){mt_input = mt.template pad<pad_size::top, pad_size::left, pad_size::right, pad_size::bottom>();ret_type mt1 = inner_conv<row_step, col_step>(mt_input, mt_tpl);return act_func.forward( mt1 + mt_bias);}inline input_type backward(const ret_type& mt_delta) {auto mt_delta_deact = act_func.backward(mt_delta);auto mt_delta_span = mt_delta_deact.template span<row_step - 1, col_step - 1>();			// 采用了步长运算，等于有一些没计算，所以反向传播时候的贡献是0using ret_pad_type = decltype(mt_delta_span);/* 计算反向传播误差 *//* 计算返回阵需要pad的大小 */constexpr int target_r = tpl_row + pad_type::row_num - 1;constexpr int target_c = tpl_col + pad_type::col_num - 1;constexpr int pad_top = (target_r - ret_pad_type::row_num) / 2;constexpr int pad_left = (target_c - ret_pad_type::col_num) / 2;constexpr int pad_right = (target_c - ret_pad_type::col_num) - pad_left;constexpr int pad_bottom = (target_r - ret_pad_type::row_num) - pad_top;auto mt_delta_span_pad = mt_delta_span.template pad<pad_top, pad_left, pad_right, pad_bottom>();auto mt_tpl_rot = mt_tpl.rot180();auto mt_ret_pad = inner_conv<1, 1, target_r, target_c, tpl_row, tpl_col, val_t>(mt_delta_span_pad, mt_tpl_rot);input_type mt_ret;mt_ret.template assign<-1 * pad_size::top, 
-1 * pad_size::left>(mt_ret_pad);			// 剪除外边/* 计算卷积核更新 */auto mt_update = inner_conv<1, 1, pad_type::row_num, pad_type::col_num, ret_pad_type::row_num, ret_pad_type::col_num, val_t>(mt_input, mt_delta_span);if (mt_update.max_abs() != 0)mt_update = mt_update / mt_update.max_abs();mt_tpl = um_tpl.update(mt_tpl, mt_update);mt_bias = mt_bias - um_bias.update(mt_delta_deact, mt_delta_deact);/* 将模板均值置0，最大波动范围为1 */double d_mean = mt_tpl.sum() / (tpl_row * tpl_col);mt_tpl = mt_tpl - d_mean;if (mt_tpl.max_abs() != 0)mt_tpl = mt_tpl / mt_tpl.max_abs();return mt_ret;}void update_inert() {um_tpl.update_inert();um_bias.update_inert();}void print() {printf("<template>\r\n");mt_tpl.print();printf("<bias>\r\n");mt_bias.print();}static void print_type() {printf("conv_layer<%d, %d, %d, %d, %d, %d> ", input_row, input_col, tpl_row, tpl_col, row_step, col_step);input_type::print_type();}
};/* 多通道、多核的卷积层 輸入是這個樣子的：C = |C1||C2||C3|经过卷积核之后生成一个矩阵数组A = {CK1,CK2,CK3}将数组中的矩阵纵向排列形成一个大的矩阵，作为加权层的输入I = |CK1||CK2||CK3|通过加权层得到输出，并且经过激活函数得到最终的输出O = activate(W * I + B)所以最终O是CK相同维度的一个矩阵fix:--------------------------------------------------------------------------将A矩阵横向排列，然后通过加权层得到输出I=|CK1,CK2,CK3|=|C1K1, C1K2, C1K3||C2K1, C2K2, C2K3||C3K1, C3K2, C3K3|然后经过加权层输出得到OO = |sum(Wri * CiK1 + B), sum(Wri * CiK2 + B), sum(Wri * CiK3 + B)|---> Kernel= |sum(W1i * C1K1 + B), sum(W1i * C1K2 + B), sum(W1i * C1K3 + B)| ||sum(W2i * CiK1 + B), sum(W2i * CiK2 + B), sum(W2i * CiK3 + B)| | Weight|sum(W3i * CiK1 + B), sum(W3i * CiK2 + B), sum(W3i * CiK3 + B)| V最终输出的是一个增倍的矩阵。其中每个Wri都是一个矩阵。
*/
template<typename val_type, int tpl_num, int input_row, int input_col, int tpl_row, int tpl_col, int row_step, int col_step, template<typename> class update_method_templ, template<typename> class activate_func, typename tpl_init_method
>
struct conv_with_weight
{using conv_type = conv_layer<input_row, input_col, tpl_row, tpl_col, row_step, col_step, update_method_templ, activate_func, tpl_init_method, val_type>;using input_type = mat<input_row, input_col, typename conv_type::input_type::type>;using conv_ret_type = typename conv_type::ret_type;//using weight_type = typename bp_network<conv_ret_type::col_num, ReLu, val_type, nadam, he_mean_type, conv_ret_type::row_num * tpl_num, conv_ret_type::row_num>;//using ret_type = typename weight_type::ret_type;//weight_type weight_layer;using ret_type = mat<conv_ret_type::row_num*tpl_num, conv_ret_type::col_num, val_type>;conv_type tpls[tpl_num];static void print_type(){//weight_type::print_type();}template<int N>void join(ret_type& mt, const conv_ret_type* mt_each){mt.template assign<N*conv_ret_type::row_num, 0>(mt_each[N]);if constexpr(N < tpl_num - 1)join<N + 1>(mt, mt_each);}// 将数组分割成多个矩阵template<int N>void split(const ret_type& mt, conv_ret_type* mt_each){mt_each[N].template assign<-1 * N*conv_ret_type::row_num, 0>(mt);if constexpr(N < tpl_num - 1)split<N + 1>(mt, mt_each);}ret_type forward(const input_type& mt){typename conv_type::ret_type ret[tpl_num];for (int j = 0; j < tpl_num; ++j) {ret[j] = tpls[j].forward(mt);}// 定义一个输出矩阵，用于存储卷积后的结果//typename weight_type::input_type mt_ret;ret_type mt_ret;join<0>(mt_ret, ret);return mt_ret;//return weight_layer.forward(mt_ret);}input_type backward(const ret_type& delta) {input_type ret;typename conv_type::ret_type ret_delta[tpl_num];split<0>(delta, ret_delta);for (int j = 0; j < tpl_num; ++j) {ret = ret + tpls[j].backward(ret_delta[j]);}ret = ret / (val_type)tpl_num;return ret;}void update_inert() {for (int i = 0; i < tpl_num; ++i) {tpls[i].update_inert();}//weight_layer.update_inert();}
};#endif

要说为什么一定要用模板编程。其实并不是因为模板编程更快，相反，有的时候模板编程的速度要比动态的要慢，因为现在的编译器（至少gcc 4.8.5）不能很好地进行优化，比如对于模板函数的递归调用，不能智能地展开，而真的就是傻傻地一个一个调用。模板编程的好处在于在编译期就能找到一些静态的问题，比如，如果你的网络结构有问题（使用模板编程这个问题在设计的时候就能解决掉），在编译的时候你就能察觉，而不需要很麻烦的跑去调试。
回到正题，下面是卷积神经网络的代码：

#ifndef __CNN_NETWORK_HPP__
#define __CNN_NETWORK_HPP__
#include "cnn_base.hpp"
#include "pool_layer.hpp"
#include "bp_network.hpp"/*
卷积神经网络
卷积加权层  W
池化层  P
全连接层  O
I --> W1 --> P1 --> W2 --> P2 --> W3 --> P3 --> ... --> Wn --> Pn --> O*/template<typename val_type, template<typename> class update_method_templ, template<typename> class activate_func, typename tpl_init_method, int input_row, int input_col          // 输入矩阵行数，列数, int tpl_num, int tpl_row, int tpl_col              // 模板行数，列数, int row_step, int col_step            // 行步长，列步长, int pool_row, int pool_col            // 池化行数，列数>
struct conv_pool_layer
{using conv_type = conv_with_weight<val_type, tpl_num, input_row, input_col, tpl_row, tpl_col, row_step, col_step, update_method_templ, activate_func, tpl_init_method>;using pool_type = pool_layer_max<conv_type::ret_type::row_num, conv_type::ret_type::col_num, pool_row, pool_col, val_type>;using input_type = mat<input_row, input_col, typename conv_type::input_type::type>;using ret_type = typename pool_type::ret_type;conv_type mt_conv;pool_type mt_pool;ret_type forward(const input_type& mt){return mt_pool.forward(mt_conv.forward(mt));}input_type backward(const ret_type& mt){return mt_conv.backward(mt_pool.backward(mt));}void update_inert(){mt_conv.update_inert();}void print(){mt_conv.print();mt_pool.print();}static void print_type(){std::cout << "---------- conv-pool layer" << std::endl;std::cout << "tpl_num: " << tpl_num << std::endl;std::cout << "input_row: " << input_row << std::endl;std::cout << "input_col: " << input_col << std::endl;std::cout << "tpl_row: " << tpl_row << std::endl;std::cout << "tpl_col: " << tpl_col << std::endl;std::cout << "row_step: " << row_step << std::endl;std::cout << "col_step: " << col_step << std::endl;std::cout << "pool_row: " << pool_row << std::endl;std::cout << "pool_col: " << pool_col << std::endl;}
};// 堆叠卷积池化层
template<typename val_type, template<typename> class update_method_templ, template<typename> class activate_func, typename tpl_init_method, int input_row, int input_col/* 循环部分 */, int tpl_num , int tpl_row, int tpl_col, int row_step, int col_step, int pool_row, int pool_col, int... remain_layer>
struct stack_conv_pool_layer
{using conv_pool_type = conv_pool_layer<val_type, update_method_templ, activate_func, tpl_init_method, input_row, input_col, tpl_num, tpl_row, tpl_col, row_step, col_step, pool_row, pool_col>;using next_node_type = stack_conv_pool_layer<val_type, update_method_templ, activate_func, tpl_init_method, conv_pool_type::ret_type::row_num, conv_pool_type::ret_type::col_num, remain_layer...>;using input_type = typename conv_pool_type::input_type;using ret_type = typename next_node_type::ret_type;conv_pool_type mt_conv_pool;next_node_type next_node;ret_type forward(const typename conv_pool_type::input_type& mt){return next_node.forward(mt_conv_pool.forward(mt));}typename conv_pool_type::input_type backward(const ret_type& delta){return mt_conv_pool.backward(next_node.backward(delta));}void update_inert(){mt_conv_pool.update_inert();next_node.update_inert();}void print(){mt_conv_pool.print();next_node.print();}static void print_type(){std::cout << "---------- stack conv-pool layer" << std::endl;std::cout << "tpl_num: " << tpl_num << std::endl;std::cout << "input_row: " << input_row << std::endl;std::cout << "input_col: " << input_col << std::endl;std::cout << "tpl_row: " << tpl_row << std::endl;std::cout << "tpl_col: " << tpl_col << std::endl;std::cout << "row_step: " << row_step << std::endl;std::cout << "col_step: " << col_step << std::endl;std::cout << "pool_row: " << pool_row << std::endl;std::cout << "pool_col: " << pool_col << std::endl;std::cout << "remain_layer: " << sizeof...(remain_layer)/7 << std::endl;next_node_type::print_type();}
};// 堆叠卷积池化层
template<typename val_type, template<typename> class update_method_templ, template<typename> class activate_func, typename tpl_init_method, int input_row, int input_col, int tpl_num , int tpl_row, int tpl_col, int row_step, int col_step, int pool_row, int pool_col>
struct stack_conv_pool_layer<val_type, update_method_templ, activate_func, tpl_init_method, input_row, input_col, tpl_num, tpl_row, tpl_col, row_step, col_step, pool_row, pool_col>
{using conv_pool_type = conv_pool_layer<val_type, update_method_templ, activate_func, tpl_init_method, input_row, input_col, tpl_num, tpl_row, tpl_col, row_step, col_step, pool_row, pool_col>;using input_type = typename conv_pool_type::input_type;using ret_type = typename conv_pool_type::ret_type;conv_pool_type mt_conv_pool;ret_type forward(const typename conv_pool_type::input_type& mt){return mt_conv_pool.forward(mt);}typename conv_pool_type::input_type backward(const ret_type& delta){return mt_conv_pool.backward(delta);}void update_inert(){mt_conv_pool.update_inert();}void print(){mt_conv_pool.print();}static void print_type(){std::cout << "---------- stack conv-pool layer" << std::endl;std::cout << "tpl_num: " << tpl_num << std::endl;std::cout << "input_row: " << input_row << std::endl;std::cout << "input_col: " << input_col << std::endl;std::cout << "tpl_row: " << tpl_row << std::endl;std::cout << "tpl_col: " << tpl_col << std::endl;std::cout << "row_step: " << row_step << std::endl;std::cout << "col_step: " << col_step << std::endl;std::cout << "pool_row: " << pool_row << std::endl;std::cout << "pool_col: " << pool_col << std::endl;std::cout << "remain_layer: 0" << std::endl;}
};template<typename val_type/* 全连接层参数 */, template<typename> class fullcon_activate_type        // 全连接层激活函数, template<typename> class fullcon_update_method, typename fullcon_init_method, int fullcon_output_num/* 卷积池化层参数 */, template<typename> class activate_func, template<typename> class update_method_templ, typename tpl_init_method/* 全连接层网络结构 */, int input_row, int input_col/* 循环部分 */, int tpl_num , int tpl_row, int tpl_col, int row_step, int col_step, int pool_row, int pool_col, int... remain_layer>
struct cnn_network
{using conv_pool_type = stack_conv_pool_layer<val_type, update_method_templ, activate_func, tpl_init_method, input_row, input_col, tpl_num, tpl_row, tpl_col, row_step, col_step, pool_row, pool_col, remain_layer...>;using fullcon_type = bp_network<1, fullcon_activate_type, val_type, fullcon_update_method, fullcon_init_method, conv_pool_type::ret_type::size, fullcon_output_num>;using input_type = typename conv_pool_type::input_type;using output_type = typename fullcon_type::output_type;using ret_type = typename output_type;conv_pool_type mt_conv_pool;fullcon_type mt_fullcon;output_type forward(const input_type& mt){return mt_fullcon.forward(mt_conv_pool.forward(mt).to_vector());}input_type backward(const output_type& delta){typename conv_pool_type::ret_type mt_conv_pool_delta;mt_fullcon.backward(delta).to_matrix(mt_conv_pool_delta);return mt_conv_pool.backward(mt_conv_pool_delta);}void update_inert(){mt_conv_pool.update_inert();mt_fullcon.update_inert();}void print(){mt_conv_pool.print();mt_fullcon.print();}static void print_type(){std::cout << "---------- cnn network" << std::endl;std::cout << "fullcon_output_num: " << fullcon_output_num << std::endl;std::cout << "tpl_num: " << tpl_num << std::endl;std::cout << "input_row: " << input_row << std::endl;std::cout << "input_col: " << input_col << std::endl;std::cout << "tpl_row: " << tpl_row << std::endl;std::cout << "tpl_col: " << tpl_col << std::endl;std::cout << "row_step: " << row_step << std::endl;std::cout << "col_step: " << col_step << std::endl;std::cout << "pool_row: " << pool_row << std::endl;std::cout << "pool_col: " << pool_col << std::endl;std::cout << "remain_layer: " << sizeof...(remain_layer)/7 << std::endl;conv_pool_type::print_type();fullcon_type::print_type();}
};#endif

接下来是测试的代码，用的是MNIST手写数字数据集。

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <iostream>
#include <memory>
#include <random>
#include <string>
#include <utility>
#include <vector>

#include "ht_memory.h"
#include "matrix.hpp"
#include "cnn_network.hpp"struct train_data 
{matrix_host<28, 28, double> mt_image;matrix_host<10, 1, double> mt_label;int					i_num;train_data():mt_image(), mt_label(){i_num = 0;}train_data(const train_data &td){mt_image = td.mt_image;mt_label = td.mt_label;i_num = td.i_num;}train_data &operator=(const train_data &td){mt_image = td.mt_image;mt_label = td.mt_label;i_num = td.i_num;return *this;}train_data(train_data &&td){mt_image = std::move(td.mt_image);mt_label = std::move(td.mt_label);i_num = td.i_num;}train_data &operator=(train_data &&td){mt_image = std::move(td.mt_image);mt_label = std::move(td.mt_label);i_num = td.i_num;return *this;}
};int main()
{unsigned char sz_image_buf[28 * 28];std::vector<train_data> vec_train_data;ht_memory mry_train_images(ht_memory::big_endian);mry_train_images.read_file("./data/train-images.idx3-ubyte");int32_t i_image_magic_num = 0, i_image_num = 0, i_image_col_num = 0, i_image_row_num = 0;mry_train_images >> i_image_magic_num >> i_image_num >> i_image_row_num >> i_image_col_num;printf("magic num:%d | image num:%d | image_row:%d | image_col:%d\r\n", i_image_magic_num, i_image_num, i_image_row_num, i_image_col_num);ht_memory mry_train_labels(ht_memory::big_endian);mry_train_labels.read_file("./data/train-labels.idx1-ubyte");int32_t i_label_magic_num = 0, i_label_num = 0;mry_train_labels >> i_label_magic_num >> i_label_num;printf("magic num:%d | label num:%d\r\n", i_label_magic_num, i_label_num);for (int i = 0; i < i_image_num; ++i){memset(sz_image_buf, 0, sizeof(sz_image_buf));train_data td;unsigned char uc_label = 0;mry_train_images.read((char *)sz_image_buf, sizeof(sz_image_buf));td.mt_image.set_data(sz_image_buf);mry_train_labels >> uc_label;td.i_num = uc_label;td.mt_label.get((const int)uc_label, 0) = 1;vec_train_data.push_back(td);}// 训练参数数量输入std::string str_train_times;std::cout << "train times:";std::getline(std::cin, str_train_times);int i_train_times = std::stol(str_train_times);std::cout << "train data set size:";std::string str_train_data_set_size;std::getline(std::cin, str_train_data_set_size);int i_train_data_set_size = std::stol(str_train_data_set_size);std::cout << "how many times should we update inert? ";std::string str_repeat_times;std::getline(std::cin, str_repeat_times);int i_repeat_times = std::stol(str_repeat_times);std::cout << "how many times should we show the result? 
";std::string str_show_times;std::getline(std::cin, str_show_times);int i_show_times = std::stol(str_show_times);std::cout << "when correct rate reach threshold to stop?:";std::string str_repeat_threshold;std::getline(std::cin, str_repeat_threshold);double dthreshold = std::stod(str_repeat_threshold);// 打乱训练数据std::random_device rd;std::mt19937 rng(rd());std::shuffle(vec_train_data.begin(), vec_train_data.end(), rng);// 取出训练数据集std::vector<train_data> vec_train_data_set;for (int i = 0; i < i_train_data_set_size; ++i){vec_train_data_set.push_back(vec_train_data[i]);}using cnn_type = cnn_network<double, softmax, nadam, xavier_gaussian_type, 10, ReLu, nadam, he_gaussian_type, 28, 28, 16, 5, 5, 1, 1, 2, 2, 32, 5, 5, 1, 1, 2, 2>;cnn_type cnn;printf("****** cnn network ******\r\n");cnn.print_type();printf("###### cnn network ######\r\n");mat<28, 28, double>** sz_images = new mat<28,28,double>*[i_train_data_set_size];mat<10, 1, double>** sz_labels = new mat<10, 1, double>*[i_train_data_set_size];memset(sz_images, 0, sizeof(mat<28, 28, double>*) * i_train_data_set_size);memset(sz_labels, 0, sizeof(mat<10, 1, double>*) * i_train_data_set_size);int i = 0;int i_correct = 0;for (; i < i_train_times; ++i){i_correct = 0;for (int j = 0; j < i_train_data_set_size; ++j){auto &td = vec_train_data_set[j];if (sz_images[j] == nullptr){mat<28, 28, double> mt_image(td.mt_image);sz_images[j] = new mat<28, 28, double>(mt_image/256.0);}auto output = cnn.forward(*sz_images[j]);if (sz_labels[j] == nullptr){sz_labels[j] = new mat<10, 1, double>(td.mt_label);}auto delta = (output - *sz_labels[j]);cnn.backward(delta);int r = 0, c = 0;output.max_idx(r, c);if (td.i_num == r){++i_correct;}}if ((double)i_correct / i_train_data_set_size > dthreshold){break;}if (i % i_repeat_times == (i_repeat_times - 1)){cnn.update_inert();}if (i % i_show_times == (i_show_times - 1)){std::cout << "train times:" << i << " correct rate:" << (double)i_correct / i_train_data_set_size << std::endl;}}printf("train times:%d 
correct rate:%f\r\n", i, (double)i_correct / i_train_data_set_size);// 使用训练数据测试printf("---------- test with train data set ----------\r\n");i_correct = 0;for (int j = 0; j < i_train_data_set_size; ++j){auto &td = vec_train_data_set[j];auto output = cnn.forward(*sz_images[j]);int r = 0, c = 0;output.max_idx(r, c);if (td.i_num == r){++i_correct;}if (j < 10){std::cout << "label:" << td.i_num << " output:" << r << std::endl;}}// 随机找10个数据测试printf("---------- test with random data ----------\r\n");std::shuffle(vec_train_data.begin(), vec_train_data.end(), rng);for (int i = 0; i < 10; ++i){auto &td = vec_train_data[i];mat<28, 28, double> mt_image(td.mt_image);auto output = cnn.forward(mt_image/256.0);int r = 0, c = 0;output.max_idx(r, c);std::cout << "label:" << td.i_num << " output:" << r << std::endl;//output.print();}cudaDeviceReset();return 0;
}

这个程序用的是2个卷积池化层加上1个全连接层，使用nadam进行训练加速。卷积池化层使用ReLu作为激活函数，全连接层使用的是softmax激活函数。全连接层使用xavier高斯初始化方法，卷积池化层使用的是He高斯方法初始化权值。
下面看看试验结果：

magic num:2049 | label num:60000
train times:200
train data set size:500
how many times should we update inert? 20
how many times should we show the result? 10
when correct rate reach threshold to stop?:0.99
****** cnn network ******
---------- cnn network
fullcon_output_num: 10
tpl_num: 16
input_row: 28
input_col: 28
tpl_row: 5
tpl_col: 5
row_step: 1
col_step: 1
pool_row: 2
pool_col: 2
remain_layer: 1
---------- stack conv-pool layer
tpl_num: 16
input_row: 28
input_col: 28
tpl_row: 5
tpl_col: 5
row_step: 1
col_step: 1
pool_row: 2
pool_col: 2
remain_layer: 1
---------- stack conv-pool layer
tpl_num: 32
input_row: 192
input_col: 12
tpl_row: 5
tpl_col: 5
row_step: 1
col_step: 1
pool_row: 2
pool_col: 2
remain_layer: 0
---------- bp_network ----------
batch_num:1, input_num:12032, output_num:10
###### cnn network ######
train times:9 correct rate:0.986
train times:10 correct rate:0.996000
---------- test with train data set ----------
label:7 output:7
label:3 output:3
label:1 output:1
label:1 output:1
label:2 output:2
label:9 output:9
label:7 output:7
label:4 output:4
label:3 output:3
label:7 output:7
---------- test with random data ----------
label:8 output:8
label:0 output:0
label:1 output:1
label:5 output:5
label:0 output:0
label:0 output:0
label:5 output:5
label:8 output:8
label:0 output:0
label:9 output:9
CUDA error in J:\03_workspace\00_cuda\02_matrix\matrix.hpp at line 279: invalid argument

以上我们输入最大训练200次，训练集大小为500，每20轮训练更新一下nadam的惯性量，每10轮训练打印一下结果，起码达到99%的正确率才能退出。最终我们看到训练了10轮这500个数据的数据集正确率就达到了99.6%。然后分别使用训练集中的数据和随机抽取的数据进行验证，可以看到，随机抽取的10个数据正确率也是100%！
手搓CNN完成。

cuda手搓CNN识别手写数字

相关资讯

热文排行

最新新闻

推荐新闻

热搜词