Inception v4

  • 对 v3 进一步加深和优化,提高性能。
  • v3 中不同 Inception 模块的超参数(卷积层输出通道)都不同。在 v4 中对这一点做了改进,对每种尺寸的特征图采用统一的超参数。

总体结构图

graph TD; image["Image"]; stem["Stem"]; inception_a["4 x Inception-A"]; reduction_a["Reduction-A"]; inception_b["7 x Inception-B"]; reduction_b["Reduction-B"]; inception_c["3 x Inception-C"]; pool["AvgPool k=8"]; dropout["Dropout 0.2"]; classifier["softmax classifier"]; image --> |"3*299*299"| stem; stem --> |"384*35*35"| inception_a; inception_a --> |"384*35*35"| reduction_a; reduction_a --> |"1024*17*17"| inception_b; inception_b --> |"1024*17*17"| reduction_b; reduction_b --> |"1536*8*8"| inception_c; inception_c --> |"1536*8*8"| pool; pool --> |"1536"| dropout; dropout --> |"1536"| classifier;

内部模块

  • Stem
graph TD; input["Input"]; conv1["Conv k=3 s=2 BN ReLU"]; conv2["Conv k=3 BN ReLU"]; conv3["Conv k=3 p=1 BN ReLU"]; conv4["Conv k=3 s=2 BN ReLU"]; pool4["MaxPool k=3 s=2"]; concat4["DepthConcat"]; conv5_3x3_a_reduce["Conv k=1 BN ReLU"]; conv5_3x3_a["Conv k=3 BN ReLU"]; conv5_3x3_b_reduce["Conv k=1 BN ReLU"]; conv5_1x7_b["Conv k=[1,7] p=[0,3] BN ReLU"]; conv5_7x1_b["Conv k=[7,1] p=[3,0] BN ReLU"]; conv5_3x3_b["Conv k=3 BN ReLU"]; concat5["DepthConcat"]; conv6["Conv k=3 s=2 BN ReLU"]; pool6["MaxPool k=3 s=2"]; concat6["DepthConcat"]; output["Output"]; input --> |"3*299*299"| conv1; conv1 --> |"32*149*149"| conv2; conv2 --> |"32*147*147"| conv3; conv3 --> |"64*147*147"| conv4; conv3 --> |"64*147*147"| pool4; conv4 --> |"96*73*73"| concat4; pool4 --> |"64*73*73"| concat4; concat4 --> |"160*73*73"| conv5_3x3_a_reduce; conv5_3x3_a_reduce --> |"64*73*73"| conv5_3x3_a; conv5_3x3_a --> |"96*71*71"| concat5; concat4 --> |"160*73*73"| conv5_3x3_b_reduce; conv5_3x3_b_reduce --> |"64*73*73"| conv5_1x7_b; conv5_1x7_b --> |"64*73*73"| conv5_7x1_b; conv5_7x1_b --> |"64*73*73"| conv5_3x3_b; conv5_3x3_b --> |"96*71*71"| concat5; concat5 --> |"192*71*71"| conv6; concat5 --> |"192*71*71"| pool6; conv6 --> |"192*35*35"| concat6; pool6 --> |"192*35*35"| concat6; concat6 --> |"384*35*35"| output;
  • Inception A
graph TD; base["Input"]; conv1x1["Conv k=1 BN ReLU"]; conv3x3_a_reduce["Conv k=1 BN ReLU"]; conv3x3_a["Conv k=3 p=1 BN ReLU"]; pool3x3["AvgPool k=3 p=1"]; conv3x3_b_reduce["Conv k=1 BN ReLU"]; conv3x3_b1["Conv k=3 p=1 BN ReLU"]; conv3x3_b2["Conv k=3 p=1 BN ReLU"]; conv1x1_pool_proj["Conv k=1 BN ReLU"]; concat["DepthConcat"]; output["Output"]; base --> |"384*35*35"| conv1x1; base --> |"384*35*35"| conv3x3_a_reduce; base --> |"384*35*35"| conv3x3_b_reduce; base --> |"384*35*35"| pool3x3; conv3x3_a_reduce --> |"64*35*35"| conv3x3_a; conv3x3_b_reduce --> |"64*35*35"| conv3x3_b1; conv3x3_b1 --> |"96*35*35"| conv3x3_b2; pool3x3 --> |"384*35*35"| conv1x1_pool_proj; conv1x1 --> |"96*35*35"| concat; conv3x3_a --> |"96*35*35"| concat; conv3x3_b2 --> |"96*35*35"| concat; conv1x1_pool_proj --> |"96*35*35"| concat; concat --> |"384*35*35"| output;
  • Inception B
graph TD; base["Input"]; conv1x1["Conv k=1 BN ReLU"]; convnxn_a_reduce["Conv k=1 BN ReLU"]; convnxn_a_1xn["Conv k=[1,7] p=[0,3] BN ReLU"]; convnxn_a_nx1["Conv k=[7,1] p=[3,0] BN ReLU"]; convnxn_b_reduce["Conv k=1 BN ReLU"]; convnxn_b1_1xn["Conv k=[1,7] p=[0,3] BN ReLU"]; convnxn_b1_nx1["Conv k=[7,1] p=[3,0] BN ReLU"]; convnxn_b2_1xn["Conv k=[1,7] p=[0,3] BN ReLU"]; convnxn_b2_nx1["Conv k=[7,1] p=[3,0] BN ReLU"]; pool3x3["AvgPool k=3 p=1"]; conv1x1_pool_proj["Conv k=1 BN ReLU"]; concat["DepthConcat"]; output["Output"]; base --> |"1024*17*17"| conv1x1; base --> |"1024*17*17"| convnxn_a_reduce; base --> |"1024*17*17"| convnxn_b_reduce; base --> |"1024*17*17"| pool3x3; convnxn_a_reduce --> |"192*17*17"| convnxn_a_1xn; convnxn_a_1xn --> |"224*17*17"| convnxn_a_nx1; convnxn_b_reduce --> |"192*17*17"| convnxn_b1_1xn; convnxn_b1_1xn --> |"192*17*17"| convnxn_b1_nx1; convnxn_b1_nx1 --> |"224*17*17"| convnxn_b2_1xn; convnxn_b2_1xn --> |"224*17*17"| convnxn_b2_nx1; pool3x3 --> |"1024*17*17"| conv1x1_pool_proj; conv1x1 --> |"384*17*17"| concat; convnxn_a_nx1 --> |"256*17*17"| concat; convnxn_b2_nx1 --> |"256*17*17"| concat; conv1x1_pool_proj --> |"128*17*17"| concat; concat --> |"1024*17*17"| output;
  • Inception C
graph TD; base["Input"]; conv1x1["Conv k=1 BN ReLU"]; conv3x3_a_reduce["Conv k=1 BN ReLU"]; conv3x3_a_1x3["Conv k=[1,3] p=[0,1] BN ReLU"]; conv3x3_a_3x1["Conv k=[3,1] p=[1,0] BN ReLU"]; conv3x3_b_reduce["Conv k=1 BN ReLU"]; conv3x3_b1_1x3["Conv k=[1,3] p=[0,1] BN ReLU"]; conv3x3_b1_3x1["Conv k=[3,1] p=[1,0] BN ReLU"]; conv3x3_b2_1x3["Conv k=[1,3] p=[0,1] BN ReLU"]; conv3x3_b2_3x1["Conv k=[3,1] p=[1,0] BN ReLU"]; pool3x3["AvgPool k=3 p=1"]; conv1x1_pool_proj["Conv k=1 BN ReLU"]; concat["DepthConcat"]; output["Output"]; base --> |"1536*8*8"| conv1x1; base --> |"1536*8*8"| conv3x3_a_reduce; base --> |"1536*8*8"| conv3x3_b_reduce; base --> |"1536*8*8"| pool3x3; conv3x3_a_reduce --> |"384*8*8"| conv3x3_a_1x3; conv3x3_a_reduce --> |"384*8*8"| conv3x3_a_3x1; conv3x3_b_reduce --> |"384*8*8"| conv3x3_b1_1x3; conv3x3_b1_1x3 --> |"448*8*8"| conv3x3_b1_3x1; conv3x3_b1_3x1 --> |"512*8*8"| conv3x3_b2_1x3; conv3x3_b1_3x1 --> |"512*8*8"| conv3x3_b2_3x1; pool3x3 --> |"1536*8*8"| conv1x1_pool_proj; conv1x1 --> |"256*8*8"| concat; conv3x3_a_1x3 --> |"256*8*8"| concat; conv3x3_a_3x1 --> |"256*8*8"| concat; conv3x3_b2_1x3 --> |"256*8*8"| concat; conv3x3_b2_3x1 --> |"256*8*8"| concat; conv1x1_pool_proj --> |"256*8*8"| concat; concat --> |"1536*8*8"| output;
  • Reduction A
graph TD; input["Input"]; conv_3x3_a["Conv k=3 s=2 BN ReLU"]; conv_3x3_b_reduce["Conv k=1 BN ReLU"]; conv_3x3_b1["Conv k=3 p=1 BN ReLU"]; conv_3x3_b2["Conv k=3 s=2 BN ReLU"]; pool["MaxPool k=3 s=2"]; concat["DepthConcat"]; output["Output"]; input --> |"384*35*35"| conv_3x3_a; input --> |"384*35*35"| conv_3x3_b_reduce; input --> |"384*35*35"| pool; pool --> |"384*17*17"| concat; conv_3x3_a --> |"384*17*17"| concat; conv_3x3_b_reduce --> |"192*35*35"| conv_3x3_b1; conv_3x3_b1 --> |"224*35*35"| conv_3x3_b2; conv_3x3_b2 --> |"256*17*17"| concat; concat --> |"1024*17*17"| output;
  • Reduction B
graph TD; input["Input"]; conv_3x3_a_reduce["Conv k=1 BN ReLU"]; conv_3x3_a["Conv k=3 s=2 BN ReLU"]; conv_3x3_b_reduce["Conv k=1 BN ReLU"]; conv_1x7_b["Conv k=[1,7] p=[0,3] BN ReLU"]; conv_7x1_b["Conv k=[7,1] p=[3,0] BN ReLU"]; conv_3x3_b["Conv k=3 s=2 BN ReLU"]; pool["MaxPool k=3 s=2"]; concat["DepthConcat"]; output["Output"]; input --> |"1024*17*17"| conv_3x3_a_reduce; conv_3x3_a_reduce --> |"192*17*17"| conv_3x3_a; conv_3x3_a --> |"192*8*8"| concat; input --> |"1024*17*17"| conv_3x3_b_reduce; conv_3x3_b_reduce --> |"256*17*17"| conv_1x7_b; conv_1x7_b --> |"256*17*17"| conv_7x1_b; conv_7x1_b --> |"320*17*17"| conv_3x3_b; conv_3x3_b --> |"320*8*8"| concat; input --> |"1024*17*17"| pool; pool --> |"1024*8*8"| concat; concat --> |"1536*8*8"| output;

实验结果

  • ILSVRC 2012 上 144 切割的实验结果
Network Crops Top-1 Error Top-5 Error
ResNet-151 dense 19.4% 4.5%
Inception-v3 144 18.9% 4.3%
Inception-v4 144 17.7% 3.8%

参考文献

  1. Szegedy, C., Ioffe, S., Vanhoucke, V.: Inception-v4, inception-resnet and the impact of residual connections on learning. arXiv:1602.07261 (2016)