Commit 5848f290 authored by zhengyaoqiu's avatar zhengyaoqiu

Add image cropping

parent cfc2f6c1
......@@ -43,10 +43,12 @@ def search():
image = request.args.get("image")
top_k = request.args.get("top_k", type=int)
bucket = request.args.get("bucket")
part = request.args.get("part")
milvus = MilvusClient().connect()
result = ImageSearch(get_feature_extractor(), milvus).image_to_image_search(bucket, image, top_k)
result = ImageSearch(get_feature_extractor(), milvus).image_to_image_search(bucket, image, part, top_k)
return jsonify({
'code': 0,
......
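For reference, a minimal client-side sketch of calling the updated endpoint (host and route are placeholders; the query parameters mirror the ones read by `search()` above, and the `part` value is a hypothetical region name):

```python
import requests

# Host and route are placeholders; image, bucket, top_k and part mirror request.args above.
resp = requests.get("http://localhost:5000/search", params={
    "image": "img1.jpg",        # object key of the query image
    "bucket": "products",       # storage bucket to search in
    "top_k": 10,                # number of results to return
    "part": "Upper-clothes",    # hypothetical value: the parsed region to crop before searching
})
print(resp.json()["code"])      # 0 on success, per the response built above
```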
**/__pycache__
data/
log/
pretrain_model/
MIT License
Copyright (c) 2020 Peike Li
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
# Self Correction for Human Parsing
![Python 3.6](https://img.shields.io/badge/python-3.6-green.svg)
[![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](https://opensource.org/licenses/MIT)
An out-of-box human parsing representation extractor.
Our solution ranks 1st for all human parsing tracks (including single, multiple and video) in the third LIP challenge!
![lip-visualization](./demo/lip-visualization.jpg)
Features:
- [x] Out-of-box human parsing extractor for other downstream applications.
- [x] Pretrained model on three popular single person human parsing datasets.
- [x] Training and inference code.
- [x] Simple yet effective extension on multi-person and video human parsing tasks.
## Requirements
```
conda env create -f environment.yaml
conda activate schp
pip install -r requirements.txt
```
## Simple Out-of-Box Extractor
The easiest way to get started is to use our trained SCHP models on your own images to extract human parsing representations. Here we provide state-of-the-art [trained models](https://drive.google.com/drive/folders/1uOaQCpNtosIjEL2phQKEdiYd0Td18jNo?usp=sharing) on three popular datasets. These three datasets use different label systems, so you can choose the one that best fits your own task.
**LIP** ([exp-schp-201908261155-lip.pth](https://drive.google.com/file/d/1k4dllHpu0bdx38J7H28rVVLpU-kOHmnH/view?usp=sharing))
* mIoU on LIP validation: **59.36 %**.
* LIP is the largest single-person human parsing dataset, with 50000+ images. This dataset focuses more on complicated real-world scenarios. LIP has 20 labels, including 'Background', 'Hat', 'Hair', 'Glove', 'Sunglasses', 'Upper-clothes', 'Dress', 'Coat', 'Socks', 'Pants', 'Jumpsuits', 'Scarf', 'Skirt', 'Face', 'Left-arm', 'Right-arm', 'Left-leg', 'Right-leg', 'Left-shoe', 'Right-shoe'.
**ATR** ([exp-schp-201908301523-atr.pth](https://drive.google.com/file/d/1ruJg4lqR_jgQPj-9K0PP-L2vJERYOxLP/view?usp=sharing))
* mIoU on ATR test: **82.29%**.
* ATR is a large single-person human parsing dataset, with 17000+ images. This dataset focuses more on fashion AI. ATR has 18 labels, including 'Background', 'Hat', 'Hair', 'Sunglasses', 'Upper-clothes', 'Skirt', 'Pants', 'Dress', 'Belt', 'Left-shoe', 'Right-shoe', 'Face', 'Left-leg', 'Right-leg', 'Left-arm', 'Right-arm', 'Bag', 'Scarf'.
**Pascal-Person-Part** ([exp-schp-201908270938-pascal-person-part.pth](https://drive.google.com/file/d/1E5YwNKW2VOEayK9mWCS3Kpsxf-3z04ZE/view?usp=sharing))
* mIoU on Pascal-Person-Part validation: **71.46** %.
* Pascal Person Part is a tiny single-person human parsing dataset, with 3000+ images. This dataset focuses more on body part segmentation. Pascal Person Part has 7 labels, including 'Background', 'Head', 'Torso', 'Upper Arms', 'Lower Arms', 'Upper Legs', 'Lower Legs'.
Choose one and have fun on your own task!
To extract the human parsing representation, simply put your own images in the `INPUT_PATH` folder, then download a pretrained model and run the following command. The output images with the same file names will be saved in `OUTPUT_PATH`.
```
python simple_extractor.py --dataset [DATASET] --model-restore [CHECKPOINT_PATH] --input-dir [INPUT_PATH] --output-dir [OUTPUT_PATH]
```
**[Updated]** Here is also a [colab demo example](https://colab.research.google.com/drive/1JOwOPaChoc9GzyBi5FUEYTSaP2qxJl10?usp=sharing) for quick inference provided by [@levindabhi](https://github.com/levindabhi).
The `DATASET` argument has three options: 'lip', 'atr' and 'pascal'. Note that each pixel in the output images denotes the predicted label number. The output images have the same size as the input ones. For better visualization, a palette is attached to the output images. We suggest reading the images with `PIL`, as in the sketch below.
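For example, a minimal sketch of reading one output image and inspecting its labels with `PIL` (the file name is illustrative; the label list matches the LIP labels above):

```python
import numpy as np
from PIL import Image

# LIP label names; the index equals the predicted label number (see the list above).
LIP_LABELS = ['Background', 'Hat', 'Hair', 'Glove', 'Sunglasses', 'Upper-clothes',
              'Dress', 'Coat', 'Socks', 'Pants', 'Jumpsuits', 'Scarf', 'Skirt',
              'Face', 'Left-arm', 'Right-arm', 'Left-leg', 'Right-leg',
              'Left-shoe', 'Right-shoe']

parsing = Image.open('OUTPUT_PATH/demo.png')   # palette-mode PNG produced for the 'lip' dataset
labels = np.array(parsing)                     # (H, W) array of label numbers in 0..19
print([LIP_LABELS[i] for i in np.unique(labels)])
```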
If you need not only the final parsing images but also the feature map representations, add the `--logits` flag to save the output feature maps. These feature maps are the logits before the softmax layer.
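If you do save the logits, here is a small sketch of turning them into per-pixel class probabilities; it assumes the feature maps were stored as a NumPy array of shape `(H, W, num_classes)`, so adjust the loading step to however you exported them:

```python
import numpy as np

logits = np.load('OUTPUT_PATH/demo_logits.npy')            # assumed shape: (H, W, num_classes)
# Softmax over the class dimension, subtracting the max for numerical stability.
exp = np.exp(logits - logits.max(axis=-1, keepdims=True))
probs = exp / exp.sum(axis=-1, keepdims=True)
confidence = probs.max(axis=-1)                            # per-pixel confidence of the predicted label
print('mean confidence:', confidence.mean())
```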
## Dataset Preparation
Please download the [LIP](http://sysu-hcp.net/lip/) dataset and arrange it in the structure below; a quick sanity check for the layout follows.
```commandline
data/LIP
|--- train_images # 30462 training single person images
|--- val_images # 10000 validation single person images
|--- train_segmentations # 30462 training annotations
|--- val_segmentations # 10000 validation annotations
|--- train_id.txt # training image list
|--- val_id.txt # validation image list
```
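A small sanity check that the extracted dataset matches this layout (the file extensions follow `datasets/datasets.py`: `.jpg` images and `.png` annotations):

```python
import os

root = './data/LIP'
for split in ['train', 'val']:
    ids = [line.strip() for line in open(os.path.join(root, split + '_id.txt'))]
    missing = [i for i in ids
               if not os.path.exists(os.path.join(root, split + '_images', i + '.jpg'))
               or not os.path.exists(os.path.join(root, split + '_segmentations', i + '.png'))]
    print(f'{split}: {len(ids)} ids, {len(missing)} missing files')
```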
## Training
```
python train.py
```
By default, the trained model will be saved in the `./log` directory. Please read the arguments for more details.
## Evaluation
```
python evaluate.py --model-restore [CHECKPOINT_PATH]
```
`CHECKPOINT_PATH` should be the path to the trained model.
## Extension on Multiple Human Parsing
Please read [MultipleHumanParsing.md](./mhp_extension/README.md) for more details.
## Citation
Please cite our work if you find this repo useful in your research.
```latex
@article{li2020self,
title={Self-Correction for Human Parsing},
author={Li, Peike and Xu, Yunqiu and Wei, Yunchao and Yang, Yi},
journal={IEEE Transactions on Pattern Analysis and Machine Intelligence},
year={2020},
doi={10.1109/TPAMI.2020.3048039}}
```
## Visualization
* Source Image.
![demo](./demo/demo.jpg)
* LIP Parsing Result.
![demo-lip](./demo/demo_lip.png)
* ATR Parsing Result.
![demo-atr](./demo/demo_atr.png)
* Pascal-Person-Part Parsing Result.
![demo-pascal](./demo/demo_pascal.png)
* Source Image.
![demo](./mhp_extension/demo/demo.jpg)
* Instance Human Mask.
![demo-lip](./mhp_extension/demo/demo_instance_human_mask.png)
* Global Human Parsing Result.
![demo-lip](./mhp_extension/demo/demo_global_human_parsing.png)
* Multiple Human Parsing Result.
![demo-lip](./mhp_extension/demo/demo_multiple_human_parsing.png)
## Related
Our code adopts [InplaceSyncBN](https://github.com/mapillary/inplace_abn) to reduce GPU memory cost.
There is also a [PaddlePaddle](https://github.com/PaddlePaddle/PaddleSeg/tree/develop/contrib/ACE2P) implementation of this project.
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
@Author : Peike Li
@Contact : peike.li@yahoo.com
@File : datasets.py
@Time : 8/4/19 3:35 PM
@Desc :
@License : This source code is licensed under the license found in the
LICENSE file in the root directory of this source tree.
"""
import os
import numpy as np
import random
import torch
import cv2
from torch.utils import data
from utils.transforms import get_affine_transform
class LIPDataSet(data.Dataset):
def __init__(self, root, dataset, crop_size=[473, 473], scale_factor=0.25,
rotation_factor=30, ignore_label=255, transform=None):
self.root = root
self.aspect_ratio = crop_size[1] * 1.0 / crop_size[0]
self.crop_size = np.asarray(crop_size)
self.ignore_label = ignore_label
self.scale_factor = scale_factor
self.rotation_factor = rotation_factor
self.flip_prob = 0.5
self.transform = transform
self.dataset = dataset
list_path = os.path.join(self.root, self.dataset + '_id.txt')
train_list = [i_id.strip() for i_id in open(list_path)]
self.train_list = train_list
self.number_samples = len(self.train_list)
def __len__(self):
return self.number_samples
def _box2cs(self, box):
x, y, w, h = box[:4]
return self._xywh2cs(x, y, w, h)
def _xywh2cs(self, x, y, w, h):
center = np.zeros((2), dtype=np.float32)
center[0] = x + w * 0.5
center[1] = y + h * 0.5
if w > self.aspect_ratio * h:
h = w * 1.0 / self.aspect_ratio
elif w < self.aspect_ratio * h:
w = h * self.aspect_ratio
scale = np.array([w * 1.0, h * 1.0], dtype=np.float32)
return center, scale
def __getitem__(self, index):
train_item = self.train_list[index]
im_path = os.path.join(self.root, self.dataset + '_images', train_item + '.jpg')
parsing_anno_path = os.path.join(self.root, self.dataset + '_segmentations', train_item + '.png')
im = cv2.imread(im_path, cv2.IMREAD_COLOR)
h, w, _ = im.shape
parsing_anno = np.zeros((h, w), dtype=np.long)
# Get person center and scale
person_center, s = self._box2cs([0, 0, w - 1, h - 1])
r = 0
if self.dataset != 'test':
# Get pose annotation
parsing_anno = cv2.imread(parsing_anno_path, cv2.IMREAD_GRAYSCALE)
if self.dataset == 'train' or self.dataset == 'trainval':
sf = self.scale_factor
rf = self.rotation_factor
s = s * np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf)
r = np.clip(np.random.randn() * rf, -rf * 2, rf * 2) if random.random() <= 0.6 else 0
if random.random() <= self.flip_prob:
im = im[:, ::-1, :]
parsing_anno = parsing_anno[:, ::-1]
person_center[0] = im.shape[1] - person_center[0] - 1
right_idx = [15, 17, 19]
left_idx = [14, 16, 18]
for i in range(0, 3):
right_pos = np.where(parsing_anno == right_idx[i])
left_pos = np.where(parsing_anno == left_idx[i])
parsing_anno[right_pos[0], right_pos[1]] = left_idx[i]
parsing_anno[left_pos[0], left_pos[1]] = right_idx[i]
trans = get_affine_transform(person_center, s, r, self.crop_size)
input = cv2.warpAffine(
im,
trans,
(int(self.crop_size[1]), int(self.crop_size[0])),
flags=cv2.INTER_LINEAR,
borderMode=cv2.BORDER_CONSTANT,
borderValue=(0, 0, 0))
if self.transform:
input = self.transform(input)
meta = {
'name': train_item,
'center': person_center,
'height': h,
'width': w,
'scale': s,
'rotation': r
}
if self.dataset == 'val' or self.dataset == 'test':
return input, meta
else:
label_parsing = cv2.warpAffine(
parsing_anno,
trans,
(int(self.crop_size[1]), int(self.crop_size[0])),
flags=cv2.INTER_NEAREST,
borderMode=cv2.BORDER_CONSTANT,
borderValue=(255))
label_parsing = torch.from_numpy(label_parsing)
return input, label_parsing, meta
class LIPDataValSet(data.Dataset):
def __init__(self, root, dataset='val', crop_size=[473, 473], transform=None, flip=False):
self.root = root
self.crop_size = crop_size
self.transform = transform
self.flip = flip
self.dataset = dataset
self.root = root
self.aspect_ratio = crop_size[1] * 1.0 / crop_size[0]
self.crop_size = np.asarray(crop_size)
list_path = os.path.join(self.root, self.dataset + '_id.txt')
val_list = [i_id.strip() for i_id in open(list_path)]
self.val_list = val_list
self.number_samples = len(self.val_list)
def __len__(self):
return len(self.val_list)
def _box2cs(self, box):
x, y, w, h = box[:4]
return self._xywh2cs(x, y, w, h)
def _xywh2cs(self, x, y, w, h):
center = np.zeros((2), dtype=np.float32)
center[0] = x + w * 0.5
center[1] = y + h * 0.5
if w > self.aspect_ratio * h:
h = w * 1.0 / self.aspect_ratio
elif w < self.aspect_ratio * h:
w = h * self.aspect_ratio
scale = np.array([w * 1.0, h * 1.0], dtype=np.float32)
return center, scale
def __getitem__(self, index):
val_item = self.val_list[index]
# Load training image
im_path = os.path.join(self.root, self.dataset + '_images', val_item + '.jpg')
im = cv2.imread(im_path, cv2.IMREAD_COLOR)
h, w, _ = im.shape
# Get person center and scale
person_center, s = self._box2cs([0, 0, w - 1, h - 1])
r = 0
trans = get_affine_transform(person_center, s, r, self.crop_size)
input = cv2.warpAffine(
im,
trans,
(int(self.crop_size[1]), int(self.crop_size[0])),
flags=cv2.INTER_LINEAR,
borderMode=cv2.BORDER_CONSTANT,
borderValue=(0, 0, 0))
input = self.transform(input)
flip_input = input.flip(dims=[-1])
if self.flip:
batch_input_im = torch.stack([input, flip_input])
else:
batch_input_im = input
meta = {
'name': val_item,
'center': person_center,
'height': h,
'width': w,
'scale': s,
'rotation': r
}
return batch_input_im, meta
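# Minimal usage sketch for the classes above (assumes ./data/LIP follows the layout in the
# README; the normalization statistics below are placeholders, while evaluate.py takes the
# real ones from model.mean / model.std).
if __name__ == '__main__':
    import torchvision.transforms as transforms

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.406, 0.456, 0.485], std=[0.225, 0.224, 0.229]),
    ])
    train_set = LIPDataSet('./data/LIP', 'train', crop_size=[473, 473], transform=transform)
    loader = data.DataLoader(train_set, batch_size=4, shuffle=True)
    images, labels, meta = next(iter(loader))
    print(images.shape, labels.shape)  # expected: [4, 3, 473, 473] and [4, 473, 473]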
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
@Author : Peike Li
@Contact : peike.li@yahoo.com
@File : dataset.py
@Time : 8/30/19 9:12 PM
@Desc : Dataset Definition
@License : This source code is licensed under the license found in the
LICENSE file in the root directory of this source tree.
"""
import os
import cv2
import numpy as np
from torch.utils import data
from utils.transforms import get_affine_transform
class SimpleFolderDataset(data.Dataset):
def __init__(self, root, input_size=[512, 512], transform=None):
self.root = root
self.input_size = input_size
self.transform = transform
self.aspect_ratio = input_size[1] * 1.0 / input_size[0]
self.input_size = np.asarray(input_size)
self.file_list = os.listdir(self.root)
def __len__(self):
return len(self.file_list)
def _box2cs(self, box):
x, y, w, h = box[:4]
return self._xywh2cs(x, y, w, h)
def _xywh2cs(self, x, y, w, h):
center = np.zeros((2), dtype=np.float32)
center[0] = x + w * 0.5
center[1] = y + h * 0.5
if w > self.aspect_ratio * h:
h = w * 1.0 / self.aspect_ratio
elif w < self.aspect_ratio * h:
w = h * self.aspect_ratio
scale = np.array([w, h], dtype=np.float32)
return center, scale
def __getitem__(self, index):
img_name = self.file_list[index]
img_path = os.path.join(self.root, img_name)
img = cv2.imread(img_path, cv2.IMREAD_COLOR)
h, w, _ = img.shape
# Get person center and scale
person_center, s = self._box2cs([0, 0, w - 1, h - 1])
r = 0
trans = get_affine_transform(person_center, s, r, self.input_size)
input = cv2.warpAffine(
img,
trans,
(int(self.input_size[1]), int(self.input_size[0])),
flags=cv2.INTER_LINEAR,
borderMode=cv2.BORDER_CONSTANT,
borderValue=(0, 0, 0))
input = self.transform(input)
meta = {
'name': img_name,
'center': person_center,
'height': h,
'width': w,
'scale': s,
'rotation': r
}
return input, meta
import torch
from torch.nn import functional as F
def generate_edge_tensor(label, edge_width=3):
label = label.type(torch.cuda.FloatTensor)
if len(label.shape) == 2:
label = label.unsqueeze(0)
n, h, w = label.shape
edge = torch.zeros(label.shape, dtype=torch.float).cuda()
# right
edge_right = edge[:, 1:h, :]
edge_right[(label[:, 1:h, :] != label[:, :h - 1, :]) & (label[:, 1:h, :] != 255)
& (label[:, :h - 1, :] != 255)] = 1
# up
edge_up = edge[:, :, :w - 1]
edge_up[(label[:, :, :w - 1] != label[:, :, 1:w])
& (label[:, :, :w - 1] != 255)
& (label[:, :, 1:w] != 255)] = 1
# upright
edge_upright = edge[:, :h - 1, :w - 1]
edge_upright[(label[:, :h - 1, :w - 1] != label[:, 1:h, 1:w])
& (label[:, :h - 1, :w - 1] != 255)
& (label[:, 1:h, 1:w] != 255)] = 1
# bottomright
edge_bottomright = edge[:, :h - 1, 1:w]
edge_bottomright[(label[:, :h - 1, 1:w] != label[:, 1:h, :w - 1])
& (label[:, :h - 1, 1:w] != 255)
& (label[:, 1:h, :w - 1] != 255)] = 1
kernel = torch.ones((1, 1, edge_width, edge_width), dtype=torch.float).cuda()
with torch.no_grad():
edge = edge.unsqueeze(1)
edge = F.conv2d(edge, kernel, stride=1, padding=1)
edge[edge!=0] = 1
edge = edge.squeeze()
return edge
name: schp
channels:
- pytorch
- defaults
dependencies:
- _libgcc_mutex=0.1=main
- blas=1.0=mkl
- ca-certificates=2020.12.8=h06a4308_0
- certifi=2020.12.5=py38h06a4308_0
- cudatoolkit=10.1.243=h6bb024c_0
- freetype=2.10.4=h5ab3b9f_0
- intel-openmp=2020.2=254
- jpeg=9b=h024ee3a_2
- lcms2=2.11=h396b838_0
- ld_impl_linux-64=2.33.1=h53a641e_7
- libedit=3.1.20191231=h14c3975_1
- libffi=3.3=he6710b0_2
- libgcc-ng=9.1.0=hdf63c60_0
- libpng=1.6.37=hbc83047_0
- libstdcxx-ng=9.1.0=hdf63c60_0
- libtiff=4.1.0=h2733197_1
- lz4-c=1.9.2=heb0550a_3
- mkl=2020.2=256
- mkl-service=2.3.0=py38he904b0f_0
- mkl_fft=1.2.0=py38h23d657b_0
- mkl_random=1.1.1=py38h0573a6f_0
- ncurses=6.2=he6710b0_1
- ninja=1.10.2=py38hff7bd54_0
- numpy=1.19.2=py38h54aff64_0
- numpy-base=1.19.2=py38hfa32c7d_0
- olefile=0.46=py_0
- openssl=1.1.1i=h27cfd23_0
- pillow=8.0.1=py38he98fc37_0
- pip=20.3.3=py38h06a4308_0
- python=3.8.5=h7579374_1
- readline=8.0=h7b6447c_0
- setuptools=51.0.0=py38h06a4308_2
- six=1.15.0=py38h06a4308_0
- sqlite=3.33.0=h62c20be_0
- tk=8.6.10=hbc83047_0
- tqdm=4.55.0=pyhd3eb1b0_0
- wheel=0.36.2=pyhd3eb1b0_0
- xz=5.2.5=h7b6447c_0
- zlib=1.2.11=h7b6447c_3
- zstd=1.4.5=h9ceee32_0
- pytorch=1.5.1=py3.8_cuda10.1.243_cudnn7.6.3_0
- torchvision=0.6.1=py38_cu101
prefix: /home/peike/opt/anaconda3/envs/schp
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
@Author : Peike Li
@Contact : peike.li@yahoo.com
@File : evaluate.py
@Time : 8/4/19 3:36 PM
@Desc :
@License : This source code is licensed under the license found in the
LICENSE file in the root directory of this source tree.
"""
import os
import argparse
import numpy as np
import torch
from torch.utils import data
from tqdm import tqdm
from PIL import Image as PILImage
import torchvision.transforms as transforms
import torch.backends.cudnn as cudnn
import networks
from datasets.datasets import LIPDataValSet
from utils.miou import compute_mean_ioU
from utils.transforms import BGR2RGB_transform
from utils.transforms import transform_parsing
def get_arguments():
"""Parse all the arguments provided from the CLI.
Returns:
A list of parsed arguments.
"""
parser = argparse.ArgumentParser(description="Self Correction for Human Parsing")
# Network Structure
parser.add_argument("--arch", type=str, default='resnet101')
# Data Preference
parser.add_argument("--data-dir", type=str, default='./data/LIP')
parser.add_argument("--batch-size", type=int, default=1)
parser.add_argument("--input-size", type=str, default='473,473')
parser.add_argument("--num-classes", type=int, default=20)
parser.add_argument("--ignore-label", type=int, default=255)
parser.add_argument("--random-mirror", action="store_true")
parser.add_argument("--random-scale", action="store_true")
# Evaluation Preference
parser.add_argument("--log-dir", type=str, default='./log')
parser.add_argument("--model-restore", type=str, default='./log/checkpoint.pth.tar')
parser.add_argument("--gpu", type=str, default='0', help="choose gpu device.")
parser.add_argument("--save-results", action="store_true", help="whether to save the results.")
parser.add_argument("--flip", action="store_true", help="random flip during the test.")
parser.add_argument("--multi-scales", type=str, default='1', help="multiple scales during the test")
return parser.parse_args()
def get_palette(num_cls):
""" Returns the color map for visualizing the segmentation mask.
Args:
num_cls: Number of classes
Returns:
The color map
"""
n = num_cls
palette = [0] * (n * 3)
for j in range(0, n):
lab = j
palette[j * 3 + 0] = 0
palette[j * 3 + 1] = 0
palette[j * 3 + 2] = 0
i = 0
while lab:
palette[j * 3 + 0] |= (((lab >> 0) & 1) << (7 - i))
palette[j * 3 + 1] |= (((lab >> 1) & 1) << (7 - i))
palette[j * 3 + 2] |= (((lab >> 2) & 1) << (7 - i))
i += 1
lab >>= 3
return palette
def multi_scale_testing(model, batch_input_im, crop_size=[473, 473], flip=True, multi_scales=[1]):
flipped_idx = (15, 14, 17, 16, 19, 18)
if len(batch_input_im.shape) > 4:
batch_input_im = batch_input_im.squeeze()
if len(batch_input_im.shape) == 3:
batch_input_im = batch_input_im.unsqueeze(0)
interp = torch.nn.Upsample(size=crop_size, mode='bilinear', align_corners=True)
ms_outputs = []
for s in multi_scales:
interp_im = torch.nn.Upsample(scale_factor=s, mode='bilinear', align_corners=True)
scaled_im = interp_im(batch_input_im)
parsing_output = model(scaled_im)
parsing_output = parsing_output[0][-1]
output = parsing_output[0]
if flip:
flipped_output = parsing_output[1]
flipped_output[14:20, :, :] = flipped_output[flipped_idx, :, :]
output += flipped_output.flip(dims=[-1])
output *= 0.5
output = interp(output.unsqueeze(0))
ms_outputs.append(output[0])
ms_fused_parsing_output = torch.stack(ms_outputs)
ms_fused_parsing_output = ms_fused_parsing_output.mean(0)
ms_fused_parsing_output = ms_fused_parsing_output.permute(1, 2, 0) # HWC
parsing = torch.argmax(ms_fused_parsing_output, dim=2)
parsing = parsing.data.cpu().numpy()
ms_fused_parsing_output = ms_fused_parsing_output.data.cpu().numpy()
return parsing, ms_fused_parsing_output
def main():
"""Create the model and start the evaluation process."""
args = get_arguments()
multi_scales = [float(i) for i in args.multi_scales.split(',')]
gpus = [int(i) for i in args.gpu.split(',')]
assert len(gpus) == 1
if not args.gpu == 'None':
os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
cudnn.benchmark = True
cudnn.enabled = True
h, w = map(int, args.input_size.split(','))
input_size = [h, w]
model = networks.init_model(args.arch, num_classes=args.num_classes, pretrained=None)
IMAGE_MEAN = model.mean
IMAGE_STD = model.std
INPUT_SPACE = model.input_space
print('image mean: {}'.format(IMAGE_MEAN))
print('image std: {}'.format(IMAGE_STD))
print('input space:{}'.format(INPUT_SPACE))
if INPUT_SPACE == 'BGR':
print('BGR Transformation')
transform = transforms.Compose([
transforms.ToTensor(),
transforms.Normalize(mean=IMAGE_MEAN,
std=IMAGE_STD),
])
if INPUT_SPACE == 'RGB':
print('RGB Transformation')
transform = transforms.Compose([
transforms.ToTensor(),
BGR2RGB_transform(),
transforms.Normalize(mean=IMAGE_MEAN,
std=IMAGE_STD),
])
# Data loader
lip_test_dataset = LIPDataValSet(args.data_dir, 'val', crop_size=input_size, transform=transform, flip=args.flip)
num_samples = len(lip_test_dataset)
    print('Total number of testing samples: {}'.format(num_samples))
testloader = data.DataLoader(lip_test_dataset, batch_size=args.batch_size, shuffle=False, pin_memory=True)
# Load model weight
state_dict = torch.load(args.model_restore)['state_dict']
from collections import OrderedDict
new_state_dict = OrderedDict()
for k, v in state_dict.items():
name = k[7:] # remove `module.`
new_state_dict[name] = v
model.load_state_dict(new_state_dict)
model.cuda()
model.eval()
sp_results_dir = os.path.join(args.log_dir, 'sp_results')
if not os.path.exists(sp_results_dir):
os.makedirs(sp_results_dir)
palette = get_palette(20)
parsing_preds = []
scales = np.zeros((num_samples, 2), dtype=np.float32)
centers = np.zeros((num_samples, 2), dtype=np.int32)
with torch.no_grad():
for idx, batch in enumerate(tqdm(testloader)):
image, meta = batch
if (len(image.shape) > 4):
image = image.squeeze()
im_name = meta['name'][0]
c = meta['center'].numpy()[0]
s = meta['scale'].numpy()[0]
w = meta['width'].numpy()[0]
h = meta['height'].numpy()[0]
scales[idx, :] = s
centers[idx, :] = c
parsing, logits = multi_scale_testing(model, image.cuda(), crop_size=input_size, flip=args.flip,
multi_scales=multi_scales)
if args.save_results:
parsing_result = transform_parsing(parsing, c, s, w, h, input_size)
parsing_result_path = os.path.join(sp_results_dir, im_name + '.png')
output_im = PILImage.fromarray(np.asarray(parsing_result, dtype=np.uint8))
output_im.putpalette(palette)
output_im.save(parsing_result_path)
parsing_preds.append(parsing)
assert len(parsing_preds) == num_samples
mIoU = compute_mean_ioU(parsing_preds, scales, centers, args.num_classes, args.data_dir, input_size)
print(mIoU)
return
if __name__ == '__main__':
main()
import cv2
import numpy as np
from PIL import Image
def extract_color_region_simple(original_image, mask_image, target_color, margin=10):
"""
简化版本:提取指定颜色区域,白色背景
Args:
original_image: 原始图片 PIL Image 对象
mask_image: 色块图 PIL Image 对象
target_color: 目标颜色 RGB值,例如 [192, 0, 0] (红色)
margin: 裁剪边距,默认 10
Returns:
PIL.Image: 提取的颜色区域图片,失败返回 None
"""
if original_image is None:
print("原始图片为空")
return None
if mask_image is None:
print("掩码图片为空")
return None
# 将PIL Image转换为numpy数组
original_rgb = np.array(original_image.convert('RGB'))
mask_rgb = np.array(mask_image.convert('RGB'))
# 创建目标颜色掩码 - 精确匹配
target_color = np.array(target_color)
color_mask = np.all(mask_rgb == target_color, axis=2).astype(np.uint8) * 255
# 形态学处理
kernel = np.ones((3, 3), np.uint8)
color_mask = cv2.morphologyEx(color_mask, cv2.MORPH_CLOSE, kernel)
color_mask = cv2.morphologyEx(color_mask, cv2.MORPH_OPEN, kernel)
# 调整掩码尺寸
if color_mask.shape != original_rgb.shape[:2]:
color_mask = cv2.resize(color_mask, (original_rgb.shape[1], original_rgb.shape[0]))
# 找边界框并裁剪
coords = np.column_stack(np.where(color_mask > 0))
if len(coords) == 0:
print(f"未找到目标颜色区域 RGB{target_color.tolist()}!")
return None
y_min, x_min = coords.min(axis=0)
y_max, x_max = coords.max(axis=0)
# 添加边距
y_min = max(0, y_min - margin)
x_min = max(0, x_min - margin)
y_max = min(original_rgb.shape[0], y_max + margin)
x_max = min(original_rgb.shape[1], x_max + margin)
# 裁剪
cropped_original = original_rgb[y_min:y_max, x_min:x_max]
cropped_mask = color_mask[y_min:y_max, x_min:x_max]
# 创建白色背景结果
result = np.full_like(cropped_original, 255, dtype=np.uint8)
result[cropped_mask > 0] = cropped_original[cropped_mask > 0]
# 统计信息
mask_pixels = np.sum(cropped_mask > 0)
total_pixels = cropped_mask.shape[0] * cropped_mask.shape[1]
print(f"提取完成")
print(f"目标颜色: RGB{target_color.tolist()}")
print(f"裁剪区域尺寸: {result.shape[1]} x {result.shape[0]}")
print(f"目标颜色像素数量: {mask_pixels}")
print(f"占裁剪区域的比例: {mask_pixels / total_pixels * 100:.2f}%")
# 转换为PIL Image并返回
pil_image = Image.fromarray(result)
return pil_image
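# The mask colors produced by the SCHP pipeline come from get_palette() in evaluate.py,
# which encodes the label index into RGB bit by bit. A small helper (sketch, assuming the
# masks were saved with that 20-class LIP palette) to look up the color for a label index,
# so target_color does not have to be hard-coded:
def lip_label_to_mask_color(label_index):
    """Return the [R, G, B] palette color that get_palette() assigns to a label index."""
    color = [0, 0, 0]
    lab, i = label_index, 0
    while lab:
        color[0] |= ((lab >> 0) & 1) << (7 - i)
        color[1] |= ((lab >> 1) & 1) << (7 - i)
        color[2] |= ((lab >> 2) & 1) << (7 - i)
        i += 1
        lab >>= 3
    return color
# e.g. lip_label_to_mask_color(9) == [192, 0, 0], the LIP 'Pants' label used in the example below.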
# Usage example
if __name__ == "__main__":
    # Load the PIL Images from files
    original_image_path = r"D:\work\image_search\img1.jpg"
    mask_image_path = r"D:\work\image_search\output.png"
    original_image = Image.open(original_image_path)
    mask_image = Image.open(mask_image_path)
    # Example: extract the red region
    print("=== Extracting the red region ===")
    result_image_red = extract_color_region_simple(
        original_image,
        mask_image,
        target_color=[192, 0, 0]
    )
    if result_image_red:
        result_image_red.save(r"D:\work\image_search\extracted_red.png")
        print("Red region extracted and saved")
    else:
        print("Failed to extract the red region")
from .bn import ABN, InPlaceABN, InPlaceABNSync
from .functions import ACT_RELU, ACT_LEAKY_RELU, ACT_ELU, ACT_NONE
from .misc import GlobalAvgPool2d, SingleGPU
from .residual import IdentityResidualBlock
from .dense import DenseModule
import torch
import torch.nn as nn
import torch.nn.functional as functional
try:
from queue import Queue
except ImportError:
from Queue import Queue
from .functions import *
class ABN(nn.Module):
"""Activated Batch Normalization
This gathers a `BatchNorm2d` and an activation function in a single module
"""
def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, activation="leaky_relu", slope=0.01):
"""Creates an Activated Batch Normalization module
Parameters
----------
num_features : int
Number of feature channels in the input and output.
eps : float
Small constant to prevent numerical issues.
momentum : float
Momentum factor applied to compute running statistics as.
affine : bool
If `True` apply learned scale and shift transformation after normalization.
activation : str
Name of the activation functions, one of: `leaky_relu`, `elu` or `none`.
slope : float
Negative slope for the `leaky_relu` activation.
"""
super(ABN, self).__init__()
self.num_features = num_features
self.affine = affine
self.eps = eps
self.momentum = momentum
self.activation = activation
self.slope = slope
if self.affine:
self.weight = nn.Parameter(torch.ones(num_features))
self.bias = nn.Parameter(torch.zeros(num_features))
else:
self.register_parameter('weight', None)
self.register_parameter('bias', None)
self.register_buffer('running_mean', torch.zeros(num_features))
self.register_buffer('running_var', torch.ones(num_features))
self.reset_parameters()
def reset_parameters(self):
nn.init.constant_(self.running_mean, 0)
nn.init.constant_(self.running_var, 1)
if self.affine:
nn.init.constant_(self.weight, 1)
nn.init.constant_(self.bias, 0)
def forward(self, x):
x = functional.batch_norm(x, self.running_mean, self.running_var, self.weight, self.bias,
self.training, self.momentum, self.eps)
if self.activation == ACT_RELU:
return functional.relu(x, inplace=True)
elif self.activation == ACT_LEAKY_RELU:
return functional.leaky_relu(x, negative_slope=self.slope, inplace=True)
elif self.activation == ACT_ELU:
return functional.elu(x, inplace=True)
else:
return x
def __repr__(self):
rep = '{name}({num_features}, eps={eps}, momentum={momentum},' \
' affine={affine}, activation={activation}'
if self.activation == "leaky_relu":
rep += ', slope={slope})'
else:
rep += ')'
return rep.format(name=self.__class__.__name__, **self.__dict__)
class InPlaceABN(ABN):
"""InPlace Activated Batch Normalization"""
def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, activation="leaky_relu", slope=0.01):
"""Creates an InPlace Activated Batch Normalization module
Parameters
----------
num_features : int
Number of feature channels in the input and output.
eps : float
Small constant to prevent numerical issues.
momentum : float
Momentum factor applied to compute running statistics as.
affine : bool
If `True` apply learned scale and shift transformation after normalization.
activation : str
Name of the activation functions, one of: `leaky_relu`, `elu` or `none`.
slope : float
Negative slope for the `leaky_relu` activation.
"""
super(InPlaceABN, self).__init__(num_features, eps, momentum, affine, activation, slope)
def forward(self, x):
x, _, _ = inplace_abn(x, self.weight, self.bias, self.running_mean, self.running_var,
self.training, self.momentum, self.eps, self.activation, self.slope)
return x
class InPlaceABNSync(ABN):
"""InPlace Activated Batch Normalization with cross-GPU synchronization
This assumes that it will be replicated across GPUs using the same mechanism as in `nn.DistributedDataParallel`.
"""
def forward(self, x):
x, _, _ = inplace_abn_sync(x, self.weight, self.bias, self.running_mean, self.running_var,
self.training, self.momentum, self.eps, self.activation, self.slope)
return x
def __repr__(self):
rep = '{name}({num_features}, eps={eps}, momentum={momentum},' \
' affine={affine}, activation={activation}'
if self.activation == "leaky_relu":
rep += ', slope={slope})'
else:
rep += ')'
return rep.format(name=self.__class__.__name__, **self.__dict__)
import torch
import torch.nn as nn
import torch.nn.functional as functional
from models._util import try_index
from .bn import ABN
class DeeplabV3(nn.Module):
def __init__(self,
in_channels,
out_channels,
hidden_channels=256,
dilations=(12, 24, 36),
norm_act=ABN,
pooling_size=None):
super(DeeplabV3, self).__init__()
self.pooling_size = pooling_size
self.map_convs = nn.ModuleList([
nn.Conv2d(in_channels, hidden_channels, 1, bias=False),
nn.Conv2d(in_channels, hidden_channels, 3, bias=False, dilation=dilations[0], padding=dilations[0]),
nn.Conv2d(in_channels, hidden_channels, 3, bias=False, dilation=dilations[1], padding=dilations[1]),
nn.Conv2d(in_channels, hidden_channels, 3, bias=False, dilation=dilations[2], padding=dilations[2])
])
self.map_bn = norm_act(hidden_channels * 4)
self.global_pooling_conv = nn.Conv2d(in_channels, hidden_channels, 1, bias=False)
self.global_pooling_bn = norm_act(hidden_channels)
self.red_conv = nn.Conv2d(hidden_channels * 4, out_channels, 1, bias=False)
self.pool_red_conv = nn.Conv2d(hidden_channels, out_channels, 1, bias=False)
self.red_bn = norm_act(out_channels)
self.reset_parameters(self.map_bn.activation, self.map_bn.slope)
def reset_parameters(self, activation, slope):
gain = nn.init.calculate_gain(activation, slope)
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.xavier_normal_(m.weight.data, gain)
if hasattr(m, "bias") and m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, ABN):
if hasattr(m, "weight") and m.weight is not None:
nn.init.constant_(m.weight, 1)
if hasattr(m, "bias") and m.bias is not None:
nn.init.constant_(m.bias, 0)
def forward(self, x):
# Map convolutions
out = torch.cat([m(x) for m in self.map_convs], dim=1)
out = self.map_bn(out)
out = self.red_conv(out)
# Global pooling
pool = self._global_pooling(x)
pool = self.global_pooling_conv(pool)
pool = self.global_pooling_bn(pool)
pool = self.pool_red_conv(pool)
if self.training or self.pooling_size is None:
pool = pool.repeat(1, 1, x.size(2), x.size(3))
out += pool
out = self.red_bn(out)
return out
def _global_pooling(self, x):
if self.training or self.pooling_size is None:
pool = x.view(x.size(0), x.size(1), -1).mean(dim=-1)
pool = pool.view(x.size(0), x.size(1), 1, 1)
else:
pooling_size = (min(try_index(self.pooling_size, 0), x.shape[2]),
min(try_index(self.pooling_size, 1), x.shape[3]))
padding = (
(pooling_size[1] - 1) // 2,
(pooling_size[1] - 1) // 2 if pooling_size[1] % 2 == 1 else (pooling_size[1] - 1) // 2 + 1,
(pooling_size[0] - 1) // 2,
(pooling_size[0] - 1) // 2 if pooling_size[0] % 2 == 1 else (pooling_size[0] - 1) // 2 + 1
)
pool = functional.avg_pool2d(x, pooling_size, stride=1)
pool = functional.pad(pool, pad=padding, mode="replicate")
return pool
from collections import OrderedDict
import torch
import torch.nn as nn
from .bn import ABN
class DenseModule(nn.Module):
def __init__(self, in_channels, growth, layers, bottleneck_factor=4, norm_act=ABN, dilation=1):
super(DenseModule, self).__init__()
self.in_channels = in_channels
self.growth = growth
self.layers = layers
self.convs1 = nn.ModuleList()
self.convs3 = nn.ModuleList()
for i in range(self.layers):
self.convs1.append(nn.Sequential(OrderedDict([
("bn", norm_act(in_channels)),
("conv", nn.Conv2d(in_channels, self.growth * bottleneck_factor, 1, bias=False))
])))
self.convs3.append(nn.Sequential(OrderedDict([
("bn", norm_act(self.growth * bottleneck_factor)),
("conv", nn.Conv2d(self.growth * bottleneck_factor, self.growth, 3, padding=dilation, bias=False,
dilation=dilation))
])))
in_channels += self.growth
@property
def out_channels(self):
return self.in_channels + self.growth * self.layers
def forward(self, x):
inputs = [x]
for i in range(self.layers):
x = torch.cat(inputs, dim=1)
x = self.convs1[i](x)
x = self.convs3[i](x)
inputs += [x]
return torch.cat(inputs, dim=1)
from os import path
import torch
import torch.distributed as dist
import torch.autograd as autograd
import torch.cuda.comm as comm
from torch.autograd.function import once_differentiable
from torch.utils.cpp_extension import load
_src_path = path.join(path.dirname(path.abspath(__file__)), "src")
# Check whether CUDA is available
if torch.cuda.is_available():
sources = [path.join(_src_path, f) for f in [
"inplace_abn.cpp",
"inplace_abn_cpu.cpp",
"inplace_abn_cuda.cu",
"inplace_abn_cuda_half.cu"
]]
extra_cuda_cflags = ["--expt-extended-lambda"]
else:
    # CPU-only build
sources = [path.join(_src_path, f) for f in [
"inplace_abn.cpp",
"inplace_abn_cpu.cpp"
]]
extra_cuda_cflags = []
_backend = load(name="inplace_abn",
extra_cflags=["-O3"],
sources=sources,
extra_cuda_cflags=extra_cuda_cflags)
# Activation names
ACT_RELU = "relu"
ACT_LEAKY_RELU = "leaky_relu"
ACT_ELU = "elu"
ACT_NONE = "none"
def _check(fn, *args, **kwargs):
success = fn(*args, **kwargs)
if not success:
raise RuntimeError("CUDA Error encountered in {}".format(fn))
def _broadcast_shape(x):
out_size = []
for i, s in enumerate(x.size()):
if i != 1:
out_size.append(1)
else:
out_size.append(s)
return out_size
def _reduce(x):
if len(x.size()) == 2:
return x.sum(dim=0)
else:
n, c = x.size()[0:2]
return x.contiguous().view((n, c, -1)).sum(2).sum(0)
def _count_samples(x):
count = 1
for i, s in enumerate(x.size()):
if i != 1:
count *= s
return count
def _act_forward(ctx, x):
if ctx.activation == ACT_LEAKY_RELU:
_backend.leaky_relu_forward(x, ctx.slope)
elif ctx.activation == ACT_ELU:
_backend.elu_forward(x)
elif ctx.activation == ACT_NONE:
pass
def _act_backward(ctx, x, dx):
if ctx.activation == ACT_LEAKY_RELU:
_backend.leaky_relu_backward(x, dx, ctx.slope)
elif ctx.activation == ACT_ELU:
_backend.elu_backward(x, dx)
elif ctx.activation == ACT_NONE:
pass
class InPlaceABN(autograd.Function):
@staticmethod
def forward(ctx, x, weight, bias, running_mean, running_var,
training=True, momentum=0.1, eps=1e-05, activation=ACT_LEAKY_RELU, slope=0.01):
# Save context
ctx.training = training
ctx.momentum = momentum
ctx.eps = eps
ctx.activation = activation
ctx.slope = slope
ctx.affine = weight is not None and bias is not None
# Prepare inputs
count = _count_samples(x)
x = x.contiguous()
weight = weight.contiguous() if ctx.affine else x.new_empty(0)
bias = bias.contiguous() if ctx.affine else x.new_empty(0)
if ctx.training:
mean, var = _backend.mean_var(x)
# Update running stats
running_mean.mul_((1 - ctx.momentum)).add_(ctx.momentum * mean)
running_var.mul_((1 - ctx.momentum)).add_(ctx.momentum * var * count / (count - 1))
# Mark in-place modified tensors
ctx.mark_dirty(x, running_mean, running_var)
else:
mean, var = running_mean.contiguous(), running_var.contiguous()
ctx.mark_dirty(x)
# BN forward + activation
_backend.forward(x, mean, var, weight, bias, ctx.affine, ctx.eps)
_act_forward(ctx, x)
# Output
ctx.var = var
ctx.save_for_backward(x, var, weight, bias)
ctx.mark_non_differentiable(running_mean, running_var)
return x, running_mean, running_var
@staticmethod
@once_differentiable
def backward(ctx, dz, _drunning_mean, _drunning_var):
z, var, weight, bias = ctx.saved_tensors
dz = dz.contiguous()
# Undo activation
_act_backward(ctx, z, dz)
if ctx.training:
edz, eydz = _backend.edz_eydz(z, dz, weight, bias, ctx.affine, ctx.eps)
else:
# TODO: implement simplified CUDA backward for inference mode
edz = dz.new_zeros(dz.size(1))
eydz = dz.new_zeros(dz.size(1))
dx = _backend.backward(z, dz, var, weight, bias, edz, eydz, ctx.affine, ctx.eps)
# dweight = eydz * weight.sign() if ctx.affine else None
dweight = eydz if ctx.affine else None
if dweight is not None:
dweight[weight < 0] *= -1
dbias = edz if ctx.affine else None
return dx, dweight, dbias, None, None, None, None, None, None, None
class InPlaceABNSync(autograd.Function):
@classmethod
def forward(cls, ctx, x, weight, bias, running_mean, running_var,
training=True, momentum=0.1, eps=1e-05, activation=ACT_LEAKY_RELU, slope=0.01, equal_batches=True):
# Save context
ctx.training = training
ctx.momentum = momentum
ctx.eps = eps
ctx.activation = activation
ctx.slope = slope
ctx.affine = weight is not None and bias is not None
# Prepare inputs
ctx.world_size = dist.get_world_size() if dist.is_initialized() else 1
# count = _count_samples(x)
batch_size = x.new_tensor([x.shape[0]], dtype=torch.long)
x = x.contiguous()
weight = weight.contiguous() if ctx.affine else x.new_empty(0)
bias = bias.contiguous() if ctx.affine else x.new_empty(0)
if ctx.training:
mean, var = _backend.mean_var(x)
if ctx.world_size > 1:
# get global batch size
if equal_batches:
batch_size *= ctx.world_size
else:
dist.all_reduce(batch_size, dist.ReduceOp.SUM)
ctx.factor = x.shape[0] / float(batch_size.item())
mean_all = mean.clone() * ctx.factor
dist.all_reduce(mean_all, dist.ReduceOp.SUM)
var_all = (var + (mean - mean_all) ** 2) * ctx.factor
dist.all_reduce(var_all, dist.ReduceOp.SUM)
mean = mean_all
var = var_all
# Update running stats
running_mean.mul_((1 - ctx.momentum)).add_(ctx.momentum * mean)
count = batch_size.item() * x.view(x.shape[0], x.shape[1], -1).shape[-1]
running_var.mul_((1 - ctx.momentum)).add_(ctx.momentum * var * (float(count) / (count - 1)))
# Mark in-place modified tensors
ctx.mark_dirty(x, running_mean, running_var)
else:
mean, var = running_mean.contiguous(), running_var.contiguous()
ctx.mark_dirty(x)
# BN forward + activation
_backend.forward(x, mean, var, weight, bias, ctx.affine, ctx.eps)
_act_forward(ctx, x)
# Output
ctx.var = var
ctx.save_for_backward(x, var, weight, bias)
ctx.mark_non_differentiable(running_mean, running_var)
return x, running_mean, running_var
@staticmethod
@once_differentiable
def backward(ctx, dz, _drunning_mean, _drunning_var):
z, var, weight, bias = ctx.saved_tensors
dz = dz.contiguous()
# Undo activation
_act_backward(ctx, z, dz)
if ctx.training:
edz, eydz = _backend.edz_eydz(z, dz, weight, bias, ctx.affine, ctx.eps)
edz_local = edz.clone()
eydz_local = eydz.clone()
if ctx.world_size > 1:
edz *= ctx.factor
dist.all_reduce(edz, dist.ReduceOp.SUM)
eydz *= ctx.factor
dist.all_reduce(eydz, dist.ReduceOp.SUM)
else:
edz_local = edz = dz.new_zeros(dz.size(1))
eydz_local = eydz = dz.new_zeros(dz.size(1))
dx = _backend.backward(z, dz, var, weight, bias, edz, eydz, ctx.affine, ctx.eps)
# dweight = eydz_local * weight.sign() if ctx.affine else None
dweight = eydz_local if ctx.affine else None
if dweight is not None:
dweight[weight < 0] *= -1
dbias = edz_local if ctx.affine else None
return dx, dweight, dbias, None, None, None, None, None, None, None
inplace_abn = InPlaceABN.apply
inplace_abn_sync = InPlaceABNSync.apply
__all__ = ["inplace_abn", "inplace_abn_sync", "ACT_RELU", "ACT_LEAKY_RELU", "ACT_ELU", "ACT_NONE"]
import torch.nn as nn
import torch
import torch.distributed as dist
class GlobalAvgPool2d(nn.Module):
def __init__(self):
"""Global average pooling over the input's spatial dimensions"""
super(GlobalAvgPool2d, self).__init__()
def forward(self, inputs):
in_size = inputs.size()
return inputs.view((in_size[0], in_size[1], -1)).mean(dim=2)
class SingleGPU(nn.Module):
def __init__(self, module):
super(SingleGPU, self).__init__()
self.module=module
def forward(self, input):
return self.module(input.cuda(non_blocking=True))
from collections import OrderedDict
import torch.nn as nn
from .bn import ABN, ACT_LEAKY_RELU, ACT_ELU, ACT_NONE
import torch.nn.functional as functional
class ResidualBlock(nn.Module):
"""Configurable residual block
Parameters
----------
in_channels : int
Number of input channels.
channels : list of int
Number of channels in the internal feature maps. Can either have two or three elements: if three construct
a residual block with two `3 x 3` convolutions, otherwise construct a bottleneck block with `1 x 1`, then
`3 x 3` then `1 x 1` convolutions.
stride : int
Stride of the first `3 x 3` convolution
dilation : int
Dilation to apply to the `3 x 3` convolutions.
groups : int
Number of convolution groups. This is used to create ResNeXt-style blocks and is only compatible with
bottleneck blocks.
norm_act : callable
Function to create normalization / activation Module.
dropout: callable
Function to create Dropout Module.
"""
def __init__(self,
in_channels,
channels,
stride=1,
dilation=1,
groups=1,
norm_act=ABN,
dropout=None):
super(ResidualBlock, self).__init__()
# Check parameters for inconsistencies
if len(channels) != 2 and len(channels) != 3:
raise ValueError("channels must contain either two or three values")
if len(channels) == 2 and groups != 1:
raise ValueError("groups > 1 are only valid if len(channels) == 3")
is_bottleneck = len(channels) == 3
need_proj_conv = stride != 1 or in_channels != channels[-1]
if not is_bottleneck:
bn2 = norm_act(channels[1])
bn2.activation = ACT_NONE
layers = [
("conv1", nn.Conv2d(in_channels, channels[0], 3, stride=stride, padding=dilation, bias=False,
dilation=dilation)),
("bn1", norm_act(channels[0])),
("conv2", nn.Conv2d(channels[0], channels[1], 3, stride=1, padding=dilation, bias=False,
dilation=dilation)),
("bn2", bn2)
]
if dropout is not None:
layers = layers[0:2] + [("dropout", dropout())] + layers[2:]
else:
bn3 = norm_act(channels[2])
bn3.activation = ACT_NONE
layers = [
("conv1", nn.Conv2d(in_channels, channels[0], 1, stride=1, padding=0, bias=False)),
("bn1", norm_act(channels[0])),
("conv2", nn.Conv2d(channels[0], channels[1], 3, stride=stride, padding=dilation, bias=False,
groups=groups, dilation=dilation)),
("bn2", norm_act(channels[1])),
("conv3", nn.Conv2d(channels[1], channels[2], 1, stride=1, padding=0, bias=False)),
("bn3", bn3)
]
if dropout is not None:
layers = layers[0:4] + [("dropout", dropout())] + layers[4:]
self.convs = nn.Sequential(OrderedDict(layers))
if need_proj_conv:
self.proj_conv = nn.Conv2d(in_channels, channels[-1], 1, stride=stride, padding=0, bias=False)
self.proj_bn = norm_act(channels[-1])
self.proj_bn.activation = ACT_NONE
def forward(self, x):
if hasattr(self, "proj_conv"):
residual = self.proj_conv(x)
residual = self.proj_bn(residual)
else:
residual = x
x = self.convs(x) + residual
if self.convs.bn1.activation == ACT_LEAKY_RELU:
return functional.leaky_relu(x, negative_slope=self.convs.bn1.slope, inplace=True)
elif self.convs.bn1.activation == ACT_ELU:
return functional.elu(x, inplace=True)
else:
return x
class IdentityResidualBlock(nn.Module):
def __init__(self,
in_channels,
channels,
stride=1,
dilation=1,
groups=1,
norm_act=ABN,
dropout=None):
"""Configurable identity-mapping residual block
Parameters
----------
in_channels : int
Number of input channels.
channels : list of int
Number of channels in the internal feature maps. Can either have two or three elements: if three construct
a residual block with two `3 x 3` convolutions, otherwise construct a bottleneck block with `1 x 1`, then
`3 x 3` then `1 x 1` convolutions.
stride : int
Stride of the first `3 x 3` convolution
dilation : int
Dilation to apply to the `3 x 3` convolutions.
groups : int
Number of convolution groups. This is used to create ResNeXt-style blocks and is only compatible with
bottleneck blocks.
norm_act : callable
Function to create normalization / activation Module.
dropout: callable
Function to create Dropout Module.
"""
super(IdentityResidualBlock, self).__init__()
# Check parameters for inconsistencies
if len(channels) != 2 and len(channels) != 3:
raise ValueError("channels must contain either two or three values")
if len(channels) == 2 and groups != 1:
raise ValueError("groups > 1 are only valid if len(channels) == 3")
is_bottleneck = len(channels) == 3
need_proj_conv = stride != 1 or in_channels != channels[-1]
self.bn1 = norm_act(in_channels)
if not is_bottleneck:
layers = [
("conv1", nn.Conv2d(in_channels, channels[0], 3, stride=stride, padding=dilation, bias=False,
dilation=dilation)),
("bn2", norm_act(channels[0])),
("conv2", nn.Conv2d(channels[0], channels[1], 3, stride=1, padding=dilation, bias=False,
dilation=dilation))
]
if dropout is not None:
layers = layers[0:2] + [("dropout", dropout())] + layers[2:]
else:
layers = [
("conv1", nn.Conv2d(in_channels, channels[0], 1, stride=stride, padding=0, bias=False)),
("bn2", norm_act(channels[0])),
("conv2", nn.Conv2d(channels[0], channels[1], 3, stride=1, padding=dilation, bias=False,
groups=groups, dilation=dilation)),
("bn3", norm_act(channels[1])),
("conv3", nn.Conv2d(channels[1], channels[2], 1, stride=1, padding=0, bias=False))
]
if dropout is not None:
layers = layers[0:4] + [("dropout", dropout())] + layers[4:]
self.convs = nn.Sequential(OrderedDict(layers))
if need_proj_conv:
self.proj_conv = nn.Conv2d(in_channels, channels[-1], 1, stride=stride, padding=0, bias=False)
def forward(self, x):
if hasattr(self, "proj_conv"):
bn1 = self.bn1(x)
shortcut = self.proj_conv(bn1)
else:
shortcut = x.clone()
bn1 = self.bn1(x)
out = self.convs(bn1)
out.add_(shortcut)
return out
#pragma once
#include <ATen/ATen.h>
// Define AT_CHECK for old version of ATen where the same function was called AT_ASSERT
#ifndef AT_CHECK
#define AT_CHECK AT_ASSERT
#endif
#define CHECK_CUDA(x) AT_CHECK((x).type().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CPU(x) AT_CHECK(!(x).type().is_cuda(), #x " must be a CPU tensor")
#define CHECK_CONTIGUOUS(x) AT_CHECK((x).is_contiguous(), #x " must be contiguous")
#define CHECK_CUDA_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
#define CHECK_CPU_INPUT(x) CHECK_CPU(x); CHECK_CONTIGUOUS(x)
#include <torch/extension.h>
#include <vector>
#include "inplace_abn.h"
std::vector<at::Tensor> mean_var(at::Tensor x) {
if (x.is_cuda()) {
#ifdef WITH_CUDA
if (x.scalar_type() == at::ScalarType::Half) {
return mean_var_cuda_h(x);
} else {
return mean_var_cuda(x);
}
#else
AT_ERROR("Not compiled with CUDA support");
#endif
} else {
return mean_var_cpu(x);
}
}
at::Tensor forward(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
bool affine, float eps) {
if (x.is_cuda()) {
#ifdef WITH_CUDA
if (x.scalar_type() == at::ScalarType::Half) {
return forward_cuda_h(x, mean, var, weight, bias, affine, eps);
} else {
return forward_cuda(x, mean, var, weight, bias, affine, eps);
}
#else
AT_ERROR("Not compiled with CUDA support");
#endif
} else {
return forward_cpu(x, mean, var, weight, bias, affine, eps);
}
}
std::vector<at::Tensor> edz_eydz(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
bool affine, float eps) {
if (z.is_cuda()) {
#ifdef WITH_CUDA
if (z.scalar_type() == at::ScalarType::Half) {
return edz_eydz_cuda_h(z, dz, weight, bias, affine, eps);
} else {
return edz_eydz_cuda(z, dz, weight, bias, affine, eps);
}
#else
AT_ERROR("Not compiled with CUDA support");
#endif
} else {
return edz_eydz_cpu(z, dz, weight, bias, affine, eps);
}
}
at::Tensor backward(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
at::Tensor edz, at::Tensor eydz, bool affine, float eps) {
if (z.is_cuda()) {
#ifdef WITH_CUDA
if (z.scalar_type() == at::ScalarType::Half) {
return backward_cuda_h(z, dz, var, weight, bias, edz, eydz, affine, eps);
} else {
return backward_cuda(z, dz, var, weight, bias, edz, eydz, affine, eps);
}
#else
AT_ERROR("Not compiled with CUDA support");
#endif
} else {
return backward_cpu(z, dz, var, weight, bias, edz, eydz, affine, eps);
}
}
void leaky_relu_forward(at::Tensor z, float slope) {
at::leaky_relu_(z, slope);
}
void leaky_relu_backward(at::Tensor z, at::Tensor dz, float slope) {
if (z.is_cuda()) {
#ifdef WITH_CUDA
if (z.scalar_type() == at::ScalarType::Half) {
return leaky_relu_backward_cuda_h(z, dz, slope);
} else {
return leaky_relu_backward_cuda(z, dz, slope);
}
#else
AT_ERROR("Not compiled with CUDA support");
#endif
} else {
return leaky_relu_backward_cpu(z, dz, slope);
}
}
void elu_forward(at::Tensor z) {
at::elu_(z);
}
void elu_backward(at::Tensor z, at::Tensor dz) {
if (z.is_cuda()) {
#ifdef WITH_CUDA
return elu_backward_cuda(z, dz);
#else
AT_ERROR("Not compiled with CUDA support");
#endif
} else {
return elu_backward_cpu(z, dz);
}
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("mean_var", &mean_var, "Mean and variance computation");
m.def("forward", &forward, "In-place forward computation");
m.def("edz_eydz", &edz_eydz, "First part of backward computation");
m.def("backward", &backward, "Second part of backward computation");
m.def("leaky_relu_forward", &leaky_relu_forward, "Leaky relu forward computation");
m.def("leaky_relu_backward", &leaky_relu_backward, "Leaky relu backward computation and inversion");
m.def("elu_forward", &elu_forward, "Elu forward computation");
m.def("elu_backward", &elu_backward, "Elu backward computation and inversion");
}
#pragma once
#include <ATen/ATen.h>
#include <vector>
std::vector<at::Tensor> mean_var_cpu(at::Tensor x);
std::vector<at::Tensor> mean_var_cuda(at::Tensor x);
std::vector<at::Tensor> mean_var_cuda_h(at::Tensor x);
at::Tensor forward_cpu(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
bool affine, float eps);
at::Tensor forward_cuda(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
bool affine, float eps);
at::Tensor forward_cuda_h(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
bool affine, float eps);
std::vector<at::Tensor> edz_eydz_cpu(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
bool affine, float eps);
std::vector<at::Tensor> edz_eydz_cuda(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
bool affine, float eps);
std::vector<at::Tensor> edz_eydz_cuda_h(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
bool affine, float eps);
at::Tensor backward_cpu(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
at::Tensor edz, at::Tensor eydz, bool affine, float eps);
at::Tensor backward_cuda(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
at::Tensor edz, at::Tensor eydz, bool affine, float eps);
at::Tensor backward_cuda_h(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
at::Tensor edz, at::Tensor eydz, bool affine, float eps);
void leaky_relu_backward_cpu(at::Tensor z, at::Tensor dz, float slope);
void leaky_relu_backward_cuda(at::Tensor z, at::Tensor dz, float slope);
void leaky_relu_backward_cuda_h(at::Tensor z, at::Tensor dz, float slope);
void elu_backward_cpu(at::Tensor z, at::Tensor dz);
void elu_backward_cuda(at::Tensor z, at::Tensor dz);
static void get_dims(at::Tensor x, int64_t& num, int64_t& chn, int64_t& sp) {
num = x.size(0);
chn = x.size(1);
sp = 1;
for (int64_t i = 2; i < x.ndimension(); ++i)
sp *= x.size(i);
}
/*
* Specialized CUDA reduction functions for BN
*/
#ifdef __CUDACC__
#include "utils/cuda.cuh"
template <typename T, typename Op>
__device__ T reduce(Op op, int plane, int N, int S) {
T sum = (T)0;
for (int batch = 0; batch < N; ++batch) {
for (int x = threadIdx.x; x < S; x += blockDim.x) {
sum += op(batch, plane, x);
}
}
// sum over NumThreads within a warp
sum = warpSum(sum);
// 'transpose', and reduce within warp again
__shared__ T shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (T)0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = warpSum(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
#endif
#include <ATen/ATen.h>
#include <vector>
#include "utils/checks.h"
#include "inplace_abn.h"
at::Tensor reduce_sum(at::Tensor x) {
if (x.ndimension() == 2) {
return x.sum(0);
} else {
auto x_view = x.view({x.size(0), x.size(1), -1});
return x_view.sum(-1).sum(0);
}
}
at::Tensor broadcast_to(at::Tensor v, at::Tensor x) {
if (x.ndimension() == 2) {
return v;
} else {
std::vector<int64_t> broadcast_size = {1, -1};
for (int64_t i = 2; i < x.ndimension(); ++i)
broadcast_size.push_back(1);
return v.view(broadcast_size);
}
}
int64_t count(at::Tensor x) {
int64_t count = x.size(0);
for (int64_t i = 2; i < x.ndimension(); ++i)
count *= x.size(i);
return count;
}
at::Tensor invert_affine(at::Tensor z, at::Tensor weight, at::Tensor bias, bool affine, float eps) {
if (affine) {
return (z - broadcast_to(bias, z)) / broadcast_to(at::abs(weight) + eps, z);
} else {
return z;
}
}
std::vector<at::Tensor> mean_var_cpu(at::Tensor x) {
auto num = count(x);
auto mean = reduce_sum(x) / num;
auto diff = x - broadcast_to(mean, x);
auto var = reduce_sum(diff.pow(2)) / num;
return {mean, var};
}
at::Tensor forward_cpu(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
bool affine, float eps) {
auto gamma = affine ? at::abs(weight) + eps : at::ones_like(var);
auto mul = at::rsqrt(var + eps) * gamma;
x.sub_(broadcast_to(mean, x));
x.mul_(broadcast_to(mul, x));
if (affine) x.add_(broadcast_to(bias, x));
return x;
}
std::vector<at::Tensor> edz_eydz_cpu(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
bool affine, float eps) {
auto edz = reduce_sum(dz);
auto y = invert_affine(z, weight, bias, affine, eps);
auto eydz = reduce_sum(y * dz);
return {edz, eydz};
}
at::Tensor backward_cpu(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
at::Tensor edz, at::Tensor eydz, bool affine, float eps) {
auto y = invert_affine(z, weight, bias, affine, eps);
auto mul = affine ? at::rsqrt(var + eps) * (at::abs(weight) + eps) : at::rsqrt(var + eps);
auto num = count(z);
auto dx = (dz - broadcast_to(edz / num, dz) - y * broadcast_to(eydz / num, dz)) * broadcast_to(mul, dz);
return dx;
}
void leaky_relu_backward_cpu(at::Tensor z, at::Tensor dz, float slope) {
CHECK_CPU_INPUT(z);
CHECK_CPU_INPUT(dz);
AT_DISPATCH_FLOATING_TYPES(z.scalar_type(), "leaky_relu_backward_cpu", ([&] {
int64_t count = z.numel();
auto *_z = z.data<scalar_t>();
auto *_dz = dz.data<scalar_t>();
for (int64_t i = 0; i < count; ++i) {
if (_z[i] < 0) {
_z[i] *= 1 / slope;
_dz[i] *= slope;
}
}
}));
}
void elu_backward_cpu(at::Tensor z, at::Tensor dz) {
CHECK_CPU_INPUT(z);
CHECK_CPU_INPUT(dz);
AT_DISPATCH_FLOATING_TYPES(z.scalar_type(), "elu_backward_cpu", ([&] {
int64_t count = z.numel();
auto *_z = z.data<scalar_t>();
auto *_dz = dz.data<scalar_t>();
for (int64_t i = 0; i < count; ++i) {
if (_z[i] < 0) {
_z[i] = log1p(_z[i]);
_dz[i] *= (_z[i] + 1.f);
}
}
}));
}
#include <ATen/ATen.h>
#include <thrust/device_ptr.h>
#include <thrust/transform.h>
#include <vector>
#include "utils/checks.h"
#include "utils/cuda.cuh"
#include "inplace_abn.h"
#include <ATen/cuda/CUDAContext.h>
// Operations for reduce
template<typename T>
struct SumOp {
__device__ SumOp(const T *t, int c, int s)
: tensor(t), chn(c), sp(s) {}
__device__ __forceinline__ T operator()(int batch, int plane, int n) {
return tensor[(batch * chn + plane) * sp + n];
}
const T *tensor;
const int chn;
const int sp;
};
template<typename T>
struct VarOp {
__device__ VarOp(T m, const T *t, int c, int s)
: mean(m), tensor(t), chn(c), sp(s) {}
__device__ __forceinline__ T operator()(int batch, int plane, int n) {
T val = tensor[(batch * chn + plane) * sp + n];
return (val - mean) * (val - mean);
}
const T mean;
const T *tensor;
const int chn;
const int sp;
};
template<typename T>
struct GradOp {
__device__ GradOp(T _weight, T _bias, const T *_z, const T *_dz, int c, int s)
: weight(_weight), bias(_bias), z(_z), dz(_dz), chn(c), sp(s) {}
__device__ __forceinline__ Pair<T> operator()(int batch, int plane, int n) {
T _y = (z[(batch * chn + plane) * sp + n] - bias) / weight;
T _dz = dz[(batch * chn + plane) * sp + n];
return Pair<T>(_dz, _y * _dz);
}
const T weight;
const T bias;
const T *z;
const T *dz;
const int chn;
const int sp;
};
/***********
* mean_var
***********/
template<typename T>
__global__ void mean_var_kernel(const T *x, T *mean, T *var, int num, int chn, int sp) {
int plane = blockIdx.x;
T norm = T(1) / T(num * sp);
T _mean = reduce<T, SumOp<T>>(SumOp<T>(x, chn, sp), plane, num, sp) * norm;
__syncthreads();
T _var = reduce<T, VarOp<T>>(VarOp<T>(_mean, x, chn, sp), plane, num, sp) * norm;
if (threadIdx.x == 0) {
mean[plane] = _mean;
var[plane] = _var;
}
}
std::vector<at::Tensor> mean_var_cuda(at::Tensor x) {
CHECK_CUDA_INPUT(x);
// Extract dimensions
int64_t num, chn, sp;
get_dims(x, num, chn, sp);
// Prepare output tensors
auto mean = at::empty({chn}, x.options());
auto var = at::empty({chn}, x.options());
// Run kernel
dim3 blocks(chn);
dim3 threads(getNumThreads(sp));
auto stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES(x.type(), "mean_var_cuda", ([&] {
mean_var_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
x.data<scalar_t>(),
mean.data<scalar_t>(),
var.data<scalar_t>(),
num, chn, sp);
}));
return {mean, var};
}
/**********
* forward
**********/
template<typename T>
__global__ void forward_kernel(T *x, const T *mean, const T *var, const T *weight, const T *bias,
bool affine, float eps, int num, int chn, int sp) {
int plane = blockIdx.x;
T _mean = mean[plane];
T _var = var[plane];
T _weight = affine ? abs(weight[plane]) + eps : T(1);
T _bias = affine ? bias[plane] : T(0);
T mul = rsqrt(_var + eps) * _weight;
for (int batch = 0; batch < num; ++batch) {
for (int n = threadIdx.x; n < sp; n += blockDim.x) {
T _x = x[(batch * chn + plane) * sp + n];
T _y = (_x - _mean) * mul + _bias;
x[(batch * chn + plane) * sp + n] = _y;
}
}
}
at::Tensor forward_cuda(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
bool affine, float eps) {
CHECK_CUDA_INPUT(x);
CHECK_CUDA_INPUT(mean);
CHECK_CUDA_INPUT(var);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(bias);
// Extract dimensions
int64_t num, chn, sp;
get_dims(x, num, chn, sp);
// Run kernel
dim3 blocks(chn);
dim3 threads(getNumThreads(sp));
auto stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES(x.type(), "forward_cuda", ([&] {
forward_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
x.data<scalar_t>(),
mean.data<scalar_t>(),
var.data<scalar_t>(),
weight.data<scalar_t>(),
bias.data<scalar_t>(),
affine, eps, num, chn, sp);
}));
return x;
}
/***********
* edz_eydz
***********/
template<typename T>
__global__ void edz_eydz_kernel(const T *z, const T *dz, const T *weight, const T *bias,
T *edz, T *eydz, bool affine, float eps, int num, int chn, int sp) {
int plane = blockIdx.x;
T _weight = affine ? abs(weight[plane]) + eps : 1.f;
T _bias = affine ? bias[plane] : 0.f;
Pair<T> res = reduce<Pair<T>, GradOp<T>>(GradOp<T>(_weight, _bias, z, dz, chn, sp), plane, num, sp);
__syncthreads();
if (threadIdx.x == 0) {
edz[plane] = res.v1;
eydz[plane] = res.v2;
}
}
std::vector<at::Tensor> edz_eydz_cuda(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
bool affine, float eps) {
CHECK_CUDA_INPUT(z);
CHECK_CUDA_INPUT(dz);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(bias);
// Extract dimensions
int64_t num, chn, sp;
get_dims(z, num, chn, sp);
auto edz = at::empty({chn}, z.options());
auto eydz = at::empty({chn}, z.options());
// Run kernel
dim3 blocks(chn);
dim3 threads(getNumThreads(sp));
auto stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES(z.type(), "edz_eydz_cuda", ([&] {
edz_eydz_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
z.data<scalar_t>(),
dz.data<scalar_t>(),
weight.data<scalar_t>(),
bias.data<scalar_t>(),
edz.data<scalar_t>(),
eydz.data<scalar_t>(),
affine, eps, num, chn, sp);
}));
return {edz, eydz};
}
/***********
* backward
***********/
template<typename T>
__global__ void backward_kernel(const T *z, const T *dz, const T *var, const T *weight, const T *bias, const T *edz,
const T *eydz, T *dx, bool affine, float eps, int num, int chn, int sp) {
int plane = blockIdx.x;
T _weight = affine ? abs(weight[plane]) + eps : 1.f;
T _bias = affine ? bias[plane] : 0.f;
T _var = var[plane];
T _edz = edz[plane];
T _eydz = eydz[plane];
T _mul = _weight * rsqrt(_var + eps);
T count = T(num * sp);
for (int batch = 0; batch < num; ++batch) {
for (int n = threadIdx.x; n < sp; n += blockDim.x) {
T _dz = dz[(batch * chn + plane) * sp + n];
T _y = (z[(batch * chn + plane) * sp + n] - _bias) / _weight;
dx[(batch * chn + plane) * sp + n] = (_dz - _edz / count - _y * _eydz / count) * _mul;
}
}
}
at::Tensor backward_cuda(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
at::Tensor edz, at::Tensor eydz, bool affine, float eps) {
CHECK_CUDA_INPUT(z);
CHECK_CUDA_INPUT(dz);
CHECK_CUDA_INPUT(var);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(bias);
CHECK_CUDA_INPUT(edz);
CHECK_CUDA_INPUT(eydz);
// Extract dimensions
int64_t num, chn, sp;
get_dims(z, num, chn, sp);
auto dx = at::zeros_like(z);
// Run kernel
dim3 blocks(chn);
dim3 threads(getNumThreads(sp));
auto stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES(z.type(), "backward_cuda", ([&] {
backward_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
z.data<scalar_t>(),
dz.data<scalar_t>(),
var.data<scalar_t>(),
weight.data<scalar_t>(),
bias.data<scalar_t>(),
edz.data<scalar_t>(),
eydz.data<scalar_t>(),
dx.data<scalar_t>(),
affine, eps, num, chn, sp);
}));
return dx;
}
/**************
* activations
**************/
template<typename T>
inline void leaky_relu_backward_impl(T *z, T *dz, float slope, int64_t count) {
// Create thrust pointers
thrust::device_ptr<T> th_z = thrust::device_pointer_cast(z);
thrust::device_ptr<T> th_dz = thrust::device_pointer_cast(dz);
auto stream = at::cuda::getCurrentCUDAStream();
thrust::transform_if(thrust::cuda::par.on(stream),
th_dz, th_dz + count, th_z, th_dz,
[slope] __device__ (const T& dz) { return dz * slope; },
[] __device__ (const T& z) { return z < 0; });
thrust::transform_if(thrust::cuda::par.on(stream),
th_z, th_z + count, th_z,
[slope] __device__ (const T& z) { return z / slope; },
[] __device__ (const T& z) { return z < 0; });
}
void leaky_relu_backward_cuda(at::Tensor z, at::Tensor dz, float slope) {
CHECK_CUDA_INPUT(z);
CHECK_CUDA_INPUT(dz);
int64_t count = z.numel();
AT_DISPATCH_FLOATING_TYPES(z.type(), "leaky_relu_backward_cuda", ([&] {
leaky_relu_backward_impl<scalar_t>(z.data<scalar_t>(), dz.data<scalar_t>(), slope, count);
}));
}
template<typename T>
inline void elu_backward_impl(T *z, T *dz, int64_t count) {
// Create thrust pointers
thrust::device_ptr<T> th_z = thrust::device_pointer_cast(z);
thrust::device_ptr<T> th_dz = thrust::device_pointer_cast(dz);
auto stream = at::cuda::getCurrentCUDAStream();
thrust::transform_if(thrust::cuda::par.on(stream),
th_dz, th_dz + count, th_z, th_z, th_dz,
[] __device__ (const T& dz, const T& z) { return dz * (z + 1.); },
[] __device__ (const T& z) { return z < 0; });
thrust::transform_if(thrust::cuda::par.on(stream),
th_z, th_z + count, th_z,
[] __device__ (const T& z) { return log1p(z); },
[] __device__ (const T& z) { return z < 0; });
}
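// Ordering matters here: dz is first scaled by (z + 1) while z still holds the
// ELU output, and only afterwards is z overwritten with log1p(z) to recover the
// pre-activation input.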
void elu_backward_cuda(at::Tensor z, at::Tensor dz) {
CHECK_CUDA_INPUT(z);
CHECK_CUDA_INPUT(dz);
int64_t count = z.numel();
AT_DISPATCH_FLOATING_TYPES(z.type(), "elu_backward_cuda", ([&] {
elu_backward_impl<scalar_t>(z.data<scalar_t>(), dz.data<scalar_t>(), count);
}));
}
#include <ATen/ATen.h>
#include <cuda_fp16.h>
#include <vector>
#include "utils/checks.h"
#include "utils/cuda.cuh"
#include "inplace_abn.h"
#include <ATen/cuda/CUDAContext.h>
// Operations for reduce
struct SumOpH {
__device__ SumOpH(const half *t, int c, int s)
: tensor(t), chn(c), sp(s) {}
__device__ __forceinline__ float operator()(int batch, int plane, int n) {
return __half2float(tensor[(batch * chn + plane) * sp + n]);
}
const half *tensor;
const int chn;
const int sp;
};
struct VarOpH {
__device__ VarOpH(float m, const half *t, int c, int s)
: mean(m), tensor(t), chn(c), sp(s) {}
__device__ __forceinline__ float operator()(int batch, int plane, int n) {
const auto t = __half2float(tensor[(batch * chn + plane) * sp + n]);
return (t - mean) * (t - mean);
}
const float mean;
const half *tensor;
const int chn;
const int sp;
};
struct GradOpH {
__device__ GradOpH(float _weight, float _bias, const half *_z, const half *_dz, int c, int s)
: weight(_weight), bias(_bias), z(_z), dz(_dz), chn(c), sp(s) {}
__device__ __forceinline__ Pair<float> operator()(int batch, int plane, int n) {
float _y = (__half2float(z[(batch * chn + plane) * sp + n]) - bias) / weight;
float _dz = __half2float(dz[(batch * chn + plane) * sp + n]);
return Pair<float>(_dz, _y * _dz);
}
const float weight;
const float bias;
const half *z;
const half *dz;
const int chn;
const int sp;
};
/***********
* mean_var
***********/
__global__ void mean_var_kernel_h(const half *x, float *mean, float *var, int num, int chn, int sp) {
int plane = blockIdx.x;
float norm = 1.f / static_cast<float>(num * sp);
float _mean = reduce<float, SumOpH>(SumOpH(x, chn, sp), plane, num, sp) * norm;
__syncthreads();
float _var = reduce<float, VarOpH>(VarOpH(_mean, x, chn, sp), plane, num, sp) * norm;
if (threadIdx.x == 0) {
mean[plane] = _mean;
var[plane] = _var;
}
}
std::vector<at::Tensor> mean_var_cuda_h(at::Tensor x) {
CHECK_CUDA_INPUT(x);
// Extract dimensions
int64_t num, chn, sp;
get_dims(x, num, chn, sp);
// Prepare output tensors
auto mean = at::empty({chn},x.options().dtype(at::kFloat));
auto var = at::empty({chn},x.options().dtype(at::kFloat));
// Run kernel
dim3 blocks(chn);
dim3 threads(getNumThreads(sp));
auto stream = at::cuda::getCurrentCUDAStream();
mean_var_kernel_h<<<blocks, threads, 0, stream>>>(
reinterpret_cast<half*>(x.data<at::Half>()),
mean.data<float>(),
var.data<float>(),
num, chn, sp);
return {mean, var};
}
/**********
* forward
**********/
__global__ void forward_kernel_h(half *x, const float *mean, const float *var, const float *weight, const float *bias,
bool affine, float eps, int num, int chn, int sp) {
int plane = blockIdx.x;
const float _mean = mean[plane];
const float _var = var[plane];
const float _weight = affine ? abs(weight[plane]) + eps : 1.f;
const float _bias = affine ? bias[plane] : 0.f;
const float mul = rsqrt(_var + eps) * _weight;
for (int batch = 0; batch < num; ++batch) {
for (int n = threadIdx.x; n < sp; n += blockDim.x) {
half *x_ptr = x + (batch * chn + plane) * sp + n;
float _x = __half2float(*x_ptr);
float _y = (_x - _mean) * mul + _bias;
*x_ptr = __float2half(_y);
}
}
}
at::Tensor forward_cuda_h(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
bool affine, float eps) {
CHECK_CUDA_INPUT(x);
CHECK_CUDA_INPUT(mean);
CHECK_CUDA_INPUT(var);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(bias);
// Extract dimensions
int64_t num, chn, sp;
get_dims(x, num, chn, sp);
// Run kernel
dim3 blocks(chn);
dim3 threads(getNumThreads(sp));
auto stream = at::cuda::getCurrentCUDAStream();
forward_kernel_h<<<blocks, threads, 0, stream>>>(
reinterpret_cast<half*>(x.data<at::Half>()),
mean.data<float>(),
var.data<float>(),
weight.data<float>(),
bias.data<float>(),
affine, eps, num, chn, sp);
return x;
}
__global__ void edz_eydz_kernel_h(const half *z, const half *dz, const float *weight, const float *bias,
float *edz, float *eydz, bool affine, float eps, int num, int chn, int sp) {
int plane = blockIdx.x;
float _weight = affine ? abs(weight[plane]) + eps : 1.f;
float _bias = affine ? bias[plane] : 0.f;
Pair<float> res = reduce<Pair<float>, GradOpH>(GradOpH(_weight, _bias, z, dz, chn, sp), plane, num, sp);
__syncthreads();
if (threadIdx.x == 0) {
edz[plane] = res.v1;
eydz[plane] = res.v2;
}
}
std::vector<at::Tensor> edz_eydz_cuda_h(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
bool affine, float eps) {
CHECK_CUDA_INPUT(z);
CHECK_CUDA_INPUT(dz);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(bias);
// Extract dimensions
int64_t num, chn, sp;
get_dims(z, num, chn, sp);
auto edz = at::empty({chn},z.options().dtype(at::kFloat));
auto eydz = at::empty({chn},z.options().dtype(at::kFloat));
// Run kernel
dim3 blocks(chn);
dim3 threads(getNumThreads(sp));
auto stream = at::cuda::getCurrentCUDAStream();
edz_eydz_kernel_h<<<blocks, threads, 0, stream>>>(
reinterpret_cast<half*>(z.data<at::Half>()),
reinterpret_cast<half*>(dz.data<at::Half>()),
weight.data<float>(),
bias.data<float>(),
edz.data<float>(),
eydz.data<float>(),
affine, eps, num, chn, sp);
return {edz, eydz};
}
__global__ void backward_kernel_h(const half *z, const half *dz, const float *var, const float *weight, const float *bias, const float *edz,
const float *eydz, half *dx, bool affine, float eps, int num, int chn, int sp) {
int plane = blockIdx.x;
float _weight = affine ? abs(weight[plane]) + eps : 1.f;
float _bias = affine ? bias[plane] : 0.f;
float _var = var[plane];
float _edz = edz[plane];
float _eydz = eydz[plane];
float _mul = _weight * rsqrt(_var + eps);
float count = float(num * sp);
for (int batch = 0; batch < num; ++batch) {
for (int n = threadIdx.x; n < sp; n += blockDim.x) {
float _dz = __half2float(dz[(batch * chn + plane) * sp + n]);
float _y = (__half2float(z[(batch * chn + plane) * sp + n]) - _bias) / _weight;
dx[(batch * chn + plane) * sp + n] = __float2half((_dz - _edz / count - _y * _eydz / count) * _mul);
}
}
}
at::Tensor backward_cuda_h(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
at::Tensor edz, at::Tensor eydz, bool affine, float eps) {
CHECK_CUDA_INPUT(z);
CHECK_CUDA_INPUT(dz);
CHECK_CUDA_INPUT(var);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(bias);
CHECK_CUDA_INPUT(edz);
CHECK_CUDA_INPUT(eydz);
// Extract dimensions
int64_t num, chn, sp;
get_dims(z, num, chn, sp);
auto dx = at::zeros_like(z);
// Run kernel
dim3 blocks(chn);
dim3 threads(getNumThreads(sp));
auto stream = at::cuda::getCurrentCUDAStream();
backward_kernel_h<<<blocks, threads, 0, stream>>>(
reinterpret_cast<half*>(z.data<at::Half>()),
reinterpret_cast<half*>(dz.data<at::Half>()),
var.data<float>(),
weight.data<float>(),
bias.data<float>(),
edz.data<float>(),
eydz.data<float>(),
reinterpret_cast<half*>(dx.data<at::Half>()),
affine, eps, num, chn, sp);
return dx;
}
__global__ void leaky_relu_backward_impl_h(half *z, half *dz, float slope, int64_t count) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < count; i += blockDim.x * gridDim.x){
float _z = __half2float(z[i]);
if (_z < 0) {
dz[i] = __float2half(__half2float(dz[i]) * slope);
z[i] = __float2half(_z / slope);
}
}
}
void leaky_relu_backward_cuda_h(at::Tensor z, at::Tensor dz, float slope) {
CHECK_CUDA_INPUT(z);
CHECK_CUDA_INPUT(dz);
int64_t count = z.numel();
dim3 threads(getNumThreads(count));
dim3 blocks = (count + threads.x - 1) / threads.x;
auto stream = at::cuda::getCurrentCUDAStream();
leaky_relu_backward_impl_h<<<blocks, threads, 0, stream>>>(
reinterpret_cast<half*>(z.data<at::Half>()),
reinterpret_cast<half*>(dz.data<at::Half>()),
slope, count);
}
#pragma once
#include <ATen/ATen.h>
// Define AT_CHECK for old version of ATen where the same function was called AT_ASSERT
#ifndef AT_CHECK
#define AT_CHECK AT_ASSERT
#endif
#define CHECK_CUDA(x) AT_CHECK((x).type().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CPU(x) AT_CHECK(!(x).type().is_cuda(), #x " must be a CPU tensor")
#define CHECK_CONTIGUOUS(x) AT_CHECK((x).is_contiguous(), #x " must be contiguous")
#define CHECK_CUDA_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
#define CHECK_CPU_INPUT(x) CHECK_CPU(x); CHECK_CONTIGUOUS(x)
#pragma once
#include <ATen/ATen.h>
/*
* Functions to share code between CPU and GPU
*/
#ifdef __CUDACC__
// CUDA versions
#define HOST_DEVICE __host__ __device__
#define INLINE_HOST_DEVICE __host__ __device__ inline
#define FLOOR(x) floor(x)
#if __CUDA_ARCH__ >= 600
// Recent compute capabilities have block-level atomicAdd for all data types, so we use that
#define ACCUM(x,y) atomicAdd_block(&(x),(y))
#else
// Older architectures don't have block-level atomicAdd, nor atomicAdd for doubles, so we defer to atomicAdd for float
// and use the known atomicCAS-based implementation for double
template<typename data_t>
__device__ inline data_t atomic_add(data_t *address, data_t val) {
return atomicAdd(address, val);
}
template<>
__device__ inline double atomic_add(double *address, double val) {
unsigned long long int* address_as_ull = (unsigned long long int*)address;
unsigned long long int old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
} while (assumed != old);
return __longlong_as_double(old);
}
#define ACCUM(x,y) atomic_add(&(x),(y))
#endif // #if __CUDA_ARCH__ >= 600
#else
// CPU versions
#define HOST_DEVICE
#define INLINE_HOST_DEVICE inline
#define FLOOR(x) std::floor(x)
#define ACCUM(x,y) (x) += (y)
#endif // #ifdef __CUDACC__
#pragma once
/*
* General settings and functions
*/
const int WARP_SIZE = 32;
const int MAX_BLOCK_SIZE = 1024;
static int getNumThreads(int nElem) {
int threadSizes[6] = {32, 64, 128, 256, 512, MAX_BLOCK_SIZE};
for (int i = 0; i < 6; ++i) {
if (nElem <= threadSizes[i]) {
return threadSizes[i];
}
}
return MAX_BLOCK_SIZE;
}
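// Example: getNumThreads(200) returns 256 and getNumThreads(5000) returns
// MAX_BLOCK_SIZE; the inplace_abn kernels launch one block per channel with
// getNumThreads(sp) threads striding over the spatial dimension.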
/*
* Reduction utilities
*/
template <typename T>
__device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width = warpSize,
unsigned int mask = 0xffffffff) {
#if CUDART_VERSION >= 9000
return __shfl_xor_sync(mask, value, laneMask, width);
#else
return __shfl_xor(value, laneMask, width);
#endif
}
__device__ __forceinline__ int getMSB(int val) { return 31 - __clz(val); }
template<typename T>
struct Pair {
T v1, v2;
__device__ Pair() {}
__device__ Pair(T _v1, T _v2) : v1(_v1), v2(_v2) {}
__device__ Pair(T v) : v1(v), v2(v) {}
__device__ Pair(int v) : v1(v), v2(v) {}
__device__ Pair &operator+=(const Pair<T> &a) {
v1 += a.v1;
v2 += a.v2;
return *this;
}
};
template<typename T>
static __device__ __forceinline__ T warpSum(T val) {
#if __CUDA_ARCH__ >= 300
for (int i = 0; i < getMSB(WARP_SIZE); ++i) {
val += WARP_SHFL_XOR(val, 1 << i, WARP_SIZE);
}
#else
__shared__ T values[MAX_BLOCK_SIZE];
values[threadIdx.x] = val;
__threadfence_block();
const int base = (threadIdx.x / WARP_SIZE) * WARP_SIZE;
for (int i = 1; i < WARP_SIZE; i++) {
val += values[base + ((i + threadIdx.x) % WARP_SIZE)];
}
#endif
return val;
}
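// warpSum is a butterfly reduction: getMSB(WARP_SIZE) = 5 rounds of
// __shfl_xor with strides 1, 2, 4, 8 and 16 leave every lane holding the sum
// of all 32 lanes in the warp.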
template<typename T>
static __device__ __forceinline__ Pair<T> warpSum(Pair<T> value) {
value.v1 = warpSum(value.v1);
value.v2 = warpSum(value.v2);
return value;
}
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
@Author : Peike Li
@Contact : peike.li@yahoo.com
@File : AugmentCE2P.py
@Time : 8/4/19 3:35 PM
@Desc :
@License : This source code is licensed under the license found in the
LICENSE file in the root directory of this source tree.
"""
import functools
import torch
import torch.nn as nn
from torch.nn import functional as F
from app.models.schp.modules import InPlaceABNSync
# Note here we adopt the InplaceABNSync implementation from https://github.com/mapillary/inplace_abn
# By default, the InPlaceABNSync module contains a BatchNorm layer and a LeakyReLU layer
BatchNorm2d = functools.partial(InPlaceABNSync, activation='none')
affine_par = True
pretrained_settings = {
'resnet101': {
'imagenet': {
'input_space': 'BGR',
'input_size': [3, 224, 224],
'input_range': [0, 1],
'mean': [0.406, 0.456, 0.485],
'std': [0.225, 0.224, 0.229],
'num_classes': 1000
}
},
}
def conv3x3(in_planes, out_planes, stride=1):
"3x3 convolution with padding"
return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
padding=1, bias=False)
class Bottleneck(nn.Module):
expansion = 4
def __init__(self, inplanes, planes, stride=1, dilation=1, downsample=None, fist_dilation=1, multi_grid=1):
super(Bottleneck, self).__init__()
self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
self.bn1 = BatchNorm2d(planes)
self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
padding=dilation * multi_grid, dilation=dilation * multi_grid, bias=False)
self.bn2 = BatchNorm2d(planes)
self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
self.bn3 = BatchNorm2d(planes * 4)
self.relu = nn.ReLU(inplace=False)
self.relu_inplace = nn.ReLU(inplace=True)
self.downsample = downsample
self.dilation = dilation
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
residual = self.downsample(x)
out = out + residual
out = self.relu_inplace(out)
return out
class PSPModule(nn.Module):
"""
Reference:
Zhao, Hengshuang, et al. *"Pyramid scene parsing network."*
"""
def __init__(self, features, out_features=512, sizes=(1, 2, 3, 6)):
super(PSPModule, self).__init__()
self.stages = []
self.stages = nn.ModuleList([self._make_stage(features, out_features, size) for size in sizes])
self.bottleneck = nn.Sequential(
nn.Conv2d(features + len(sizes) * out_features, out_features, kernel_size=3, padding=1, dilation=1,
bias=False),
InPlaceABNSync(out_features),
)
def _make_stage(self, features, out_features, size):
prior = nn.AdaptiveAvgPool2d(output_size=(size, size))
conv = nn.Conv2d(features, out_features, kernel_size=1, bias=False)
bn = InPlaceABNSync(out_features)
return nn.Sequential(prior, conv, bn)
def forward(self, feats):
h, w = feats.size(2), feats.size(3)
priors = [F.interpolate(input=stage(feats), size=(h, w), mode='bilinear', align_corners=True) for stage in
self.stages] + [feats]
bottle = self.bottleneck(torch.cat(priors, 1))
return bottle
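# In the CE2P head below, PSPModule(2048, 512) turns the [N, 2048, h, w] layer4
# features into a [N, 512, h, w] context map: each 1x1/2x2/3x3/6x6 pooled branch
# is projected, upsampled back to (h, w) and fused with the input through the
# 3x3 bottleneck conv.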
class ASPPModule(nn.Module):
"""
Reference:
Chen, Liang-Chieh, et al. *"Rethinking Atrous Convolution for Semantic Image Segmentation."*
"""
def __init__(self, features, inner_features=256, out_features=512, dilations=(12, 24, 36)):
super(ASPPModule, self).__init__()
self.conv1 = nn.Sequential(nn.AdaptiveAvgPool2d((1, 1)),
nn.Conv2d(features, inner_features, kernel_size=1, padding=0, dilation=1,
bias=False),
InPlaceABNSync(inner_features))
self.conv2 = nn.Sequential(
nn.Conv2d(features, inner_features, kernel_size=1, padding=0, dilation=1, bias=False),
InPlaceABNSync(inner_features))
self.conv3 = nn.Sequential(
nn.Conv2d(features, inner_features, kernel_size=3, padding=dilations[0], dilation=dilations[0], bias=False),
InPlaceABNSync(inner_features))
self.conv4 = nn.Sequential(
nn.Conv2d(features, inner_features, kernel_size=3, padding=dilations[1], dilation=dilations[1], bias=False),
InPlaceABNSync(inner_features))
self.conv5 = nn.Sequential(
nn.Conv2d(features, inner_features, kernel_size=3, padding=dilations[2], dilation=dilations[2], bias=False),
InPlaceABNSync(inner_features))
self.bottleneck = nn.Sequential(
nn.Conv2d(inner_features * 5, out_features, kernel_size=1, padding=0, dilation=1, bias=False),
InPlaceABNSync(out_features),
nn.Dropout2d(0.1)
)
def forward(self, x):
_, _, h, w = x.size()
feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True)
feat2 = self.conv2(x)
feat3 = self.conv3(x)
feat4 = self.conv4(x)
feat5 = self.conv5(x)
out = torch.cat((feat1, feat2, feat3, feat4, feat5), 1)
bottle = self.bottleneck(out)
return bottle
class Edge_Module(nn.Module):
"""
Edge Learning Branch
"""
def __init__(self, in_fea=[256, 512, 1024], mid_fea=256, out_fea=2):
super(Edge_Module, self).__init__()
self.conv1 = nn.Sequential(
nn.Conv2d(in_fea[0], mid_fea, kernel_size=1, padding=0, dilation=1, bias=False),
InPlaceABNSync(mid_fea)
)
self.conv2 = nn.Sequential(
nn.Conv2d(in_fea[1], mid_fea, kernel_size=1, padding=0, dilation=1, bias=False),
InPlaceABNSync(mid_fea)
)
self.conv3 = nn.Sequential(
nn.Conv2d(in_fea[2], mid_fea, kernel_size=1, padding=0, dilation=1, bias=False),
InPlaceABNSync(mid_fea)
)
self.conv4 = nn.Conv2d(mid_fea, out_fea, kernel_size=3, padding=1, dilation=1, bias=True)
self.conv5 = nn.Conv2d(out_fea * 3, out_fea, kernel_size=1, padding=0, dilation=1, bias=True)
def forward(self, x1, x2, x3):
_, _, h, w = x1.size()
edge1_fea = self.conv1(x1)
edge1 = self.conv4(edge1_fea)
edge2_fea = self.conv2(x2)
edge2 = self.conv4(edge2_fea)
edge3_fea = self.conv3(x3)
edge3 = self.conv4(edge3_fea)
edge2_fea = F.interpolate(edge2_fea, size=(h, w), mode='bilinear', align_corners=True)
edge3_fea = F.interpolate(edge3_fea, size=(h, w), mode='bilinear', align_corners=True)
edge2 = F.interpolate(edge2, size=(h, w), mode='bilinear', align_corners=True)
edge3 = F.interpolate(edge3, size=(h, w), mode='bilinear', align_corners=True)
edge = torch.cat([edge1, edge2, edge3], dim=1)
edge_fea = torch.cat([edge1_fea, edge2_fea, edge3_fea], dim=1)
edge = self.conv5(edge)
return edge, edge_fea
class Decoder_Module(nn.Module):
"""
Parsing Branch Decoder Module.
"""
def __init__(self, num_classes):
super(Decoder_Module, self).__init__()
self.conv1 = nn.Sequential(
nn.Conv2d(512, 256, kernel_size=1, padding=0, dilation=1, bias=False),
InPlaceABNSync(256)
)
self.conv2 = nn.Sequential(
nn.Conv2d(256, 48, kernel_size=1, stride=1, padding=0, dilation=1, bias=False),
InPlaceABNSync(48)
)
self.conv3 = nn.Sequential(
nn.Conv2d(304, 256, kernel_size=1, padding=0, dilation=1, bias=False),
InPlaceABNSync(256),
nn.Conv2d(256, 256, kernel_size=1, padding=0, dilation=1, bias=False),
InPlaceABNSync(256)
)
self.conv4 = nn.Conv2d(256, num_classes, kernel_size=1, padding=0, dilation=1, bias=True)
def forward(self, xt, xl):
_, _, h, w = xl.size()
xt = F.interpolate(self.conv1(xt), size=(h, w), mode='bilinear', align_corners=True)
xl = self.conv2(xl)
x = torch.cat([xt, xl], dim=1)
x = self.conv3(x)
seg = self.conv4(x)
return seg, x
class ResNet(nn.Module):
def __init__(self, block, layers, num_classes):
self.inplanes = 128
super(ResNet, self).__init__()
self.conv1 = conv3x3(3, 64, stride=2)
self.bn1 = BatchNorm2d(64)
self.relu1 = nn.ReLU(inplace=False)
self.conv2 = conv3x3(64, 64)
self.bn2 = BatchNorm2d(64)
self.relu2 = nn.ReLU(inplace=False)
self.conv3 = conv3x3(64, 128)
self.bn3 = BatchNorm2d(128)
self.relu3 = nn.ReLU(inplace=False)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(block, 64, layers[0])
self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
self.layer4 = self._make_layer(block, 512, layers[3], stride=1, dilation=2, multi_grid=(1, 1, 1))
self.context_encoding = PSPModule(2048, 512)
self.edge = Edge_Module()
self.decoder = Decoder_Module(num_classes)
self.fushion = nn.Sequential(
nn.Conv2d(1024, 256, kernel_size=1, padding=0, dilation=1, bias=False),
InPlaceABNSync(256),
nn.Dropout2d(0.1),
nn.Conv2d(256, num_classes, kernel_size=1, padding=0, dilation=1, bias=True)
)
def _make_layer(self, block, planes, blocks, stride=1, dilation=1, multi_grid=1):
downsample = None
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
nn.Conv2d(self.inplanes, planes * block.expansion,
kernel_size=1, stride=stride, bias=False),
BatchNorm2d(planes * block.expansion, affine=affine_par))
layers = []
generate_multi_grid = lambda index, grids: grids[index % len(grids)] if isinstance(grids, tuple) else 1
layers.append(block(self.inplanes, planes, stride, dilation=dilation, downsample=downsample,
multi_grid=generate_multi_grid(0, multi_grid)))
self.inplanes = planes * block.expansion
for i in range(1, blocks):
layers.append(
block(self.inplanes, planes, dilation=dilation, multi_grid=generate_multi_grid(i, multi_grid)))
return nn.Sequential(*layers)
def forward(self, x):
x = self.relu1(self.bn1(self.conv1(x)))
x = self.relu2(self.bn2(self.conv2(x)))
x = self.relu3(self.bn3(self.conv3(x)))
x = self.maxpool(x)
x2 = self.layer1(x)
x3 = self.layer2(x2)
x4 = self.layer3(x3)
x5 = self.layer4(x4)
x = self.context_encoding(x5)
parsing_result, parsing_fea = self.decoder(x, x2)
# Edge Branch
edge_result, edge_fea = self.edge(x2, x3, x4)
# Fusion Branch
x = torch.cat([parsing_fea, edge_fea], dim=1)
fusion_result = self.fushion(x)
return [[parsing_result, fusion_result], [edge_result]]
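# The network returns [[parsing_result, fusion_result], [edge_result]]; the
# inference code further below uses output[0][-1] (the fused parsing logits)
# and upsamples it back to the original resolution.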
def initialize_pretrained_model(model, settings, pretrained='./models/resnet101-imagenet.pth'):
model.input_space = settings['input_space']
model.input_size = settings['input_size']
model.input_range = settings['input_range']
model.mean = settings['mean']
model.std = settings['std']
if pretrained is not None:
saved_state_dict = torch.load(pretrained)
new_params = model.state_dict().copy()
for i in saved_state_dict:
i_parts = i.split('.')
if not i_parts[0] == 'fc':
new_params['.'.join(i_parts[0:])] = saved_state_dict[i]
model.load_state_dict(new_params)
def resnet101(num_classes=20, pretrained='./models/resnet101-imagenet.pth'):
model = ResNet(Bottleneck, [3, 4, 23, 3], num_classes)
settings = pretrained_settings['resnet101']['imagenet']
initialize_pretrained_model(model, settings, pretrained)
return model
from __future__ import absolute_import
from app.models.schp.networks.AugmentCE2P import resnet101
__factory = {
'resnet101': resnet101,
}
def init_model(name, *args, **kwargs):
if name not in __factory.keys():
raise KeyError("Unknown model arch: {}".format(name))
return __factory[name](*args, **kwargs)
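# Usage sketch: build the parsing backbone without ImageNet weights and load a
# SCHP checkpoint afterwards, as HumanParsingModel does further below:
#   model = init_model('resnet101', num_classes=20, pretrained=None)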
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
@Author : Peike Li
@Contact : peike.li@yahoo.com
@File : mobilenetv2.py
@Time : 8/4/19 3:35 PM
@Desc :
@License : This source code is licensed under the license found in the
LICENSE file in the root directory of this source tree.
"""
import torch.nn as nn
import math
import functools
from torch.utils.model_zoo import load_url  # needed by mobilenetv2(pretrained=True) below
from modules import InPlaceABN, InPlaceABNSync
BatchNorm2d = functools.partial(InPlaceABNSync, activation='none')
__all__ = ['mobilenetv2']
def conv_bn(inp, oup, stride):
return nn.Sequential(
nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
BatchNorm2d(oup),
nn.ReLU6(inplace=True)
)
def conv_1x1_bn(inp, oup):
return nn.Sequential(
nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
BatchNorm2d(oup),
nn.ReLU6(inplace=True)
)
class InvertedResidual(nn.Module):
def __init__(self, inp, oup, stride, expand_ratio):
super(InvertedResidual, self).__init__()
self.stride = stride
assert stride in [1, 2]
hidden_dim = round(inp * expand_ratio)
self.use_res_connect = self.stride == 1 and inp == oup
if expand_ratio == 1:
self.conv = nn.Sequential(
# dw
nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
BatchNorm2d(hidden_dim),
nn.ReLU6(inplace=True),
# pw-linear
nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
BatchNorm2d(oup),
)
else:
self.conv = nn.Sequential(
# pw
nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False),
BatchNorm2d(hidden_dim),
nn.ReLU6(inplace=True),
# dw
nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
BatchNorm2d(hidden_dim),
nn.ReLU6(inplace=True),
# pw-linear
nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
BatchNorm2d(oup),
)
def forward(self, x):
if self.use_res_connect:
return x + self.conv(x)
else:
return self.conv(x)
class MobileNetV2(nn.Module):
def __init__(self, n_class=1000, input_size=224, width_mult=1.):
super(MobileNetV2, self).__init__()
block = InvertedResidual
input_channel = 32
last_channel = 1280
interverted_residual_setting = [
# t, c, n, s
[1, 16, 1, 1],
[6, 24, 2, 2], # layer 2
[6, 32, 3, 2], # layer 3
[6, 64, 4, 2],
[6, 96, 3, 1], # layer 4
[6, 160, 3, 2],
[6, 320, 1, 1], # layer 5
]
# building first layer
assert input_size % 32 == 0
input_channel = int(input_channel * width_mult)
self.last_channel = int(last_channel * width_mult) if width_mult > 1.0 else last_channel
self.features = [conv_bn(3, input_channel, 2)]
# building inverted residual blocks
for t, c, n, s in interverted_residual_setting:
output_channel = int(c * width_mult)
for i in range(n):
if i == 0:
self.features.append(block(input_channel, output_channel, s, expand_ratio=t))
else:
self.features.append(block(input_channel, output_channel, 1, expand_ratio=t))
input_channel = output_channel
# building last several layers
self.features.append(conv_1x1_bn(input_channel, self.last_channel))
# make it nn.Sequential
self.features = nn.Sequential(*self.features)
# building classifier
self.classifier = nn.Sequential(
nn.Dropout(0.2),
nn.Linear(self.last_channel, n_class),
)
self._initialize_weights()
def forward(self, x):
x = self.features(x)
x = x.mean(3).mean(2)
x = self.classifier(x)
return x
def _initialize_weights(self):
for m in self.modules():
if isinstance(m, nn.Conv2d):
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
m.weight.data.normal_(0, math.sqrt(2. / n))
if m.bias is not None:
m.bias.data.zero_()
elif isinstance(m, BatchNorm2d):
m.weight.data.fill_(1)
m.bias.data.zero_()
elif isinstance(m, nn.Linear):
n = m.weight.size(1)
m.weight.data.normal_(0, 0.01)
m.bias.data.zero_()
def mobilenetv2(pretrained=False, **kwargs):
"""Constructs a MobileNet_V2 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = MobileNetV2(n_class=1000, **kwargs)
if pretrained:
# NOTE: this file defines no model_urls mapping, so a 'mobilenetv2' checkpoint
# URL must be supplied elsewhere before pretrained=True can work
model.load_state_dict(load_url(model_urls['mobilenetv2']), strict=False)
return model
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
@Author : Peike Li
@Contact : peike.li@yahoo.com
@File : resnet.py
@Time : 8/4/19 3:35 PM
@Desc :
@License : This source code is licensed under the license found in the
LICENSE file in the root directory of this source tree.
"""
import functools
import torch.nn as nn
import math
from torch.utils.model_zoo import load_url
from modules import InPlaceABNSync
BatchNorm2d = functools.partial(InPlaceABNSync, activation='none')
__all__ = ['ResNet', 'resnet18', 'resnet50', 'resnet101']
model_urls = {
'resnet18': 'http://sceneparsing.csail.mit.edu/model/pretrained_resnet/resnet18-imagenet.pth',
'resnet50': 'http://sceneparsing.csail.mit.edu/model/pretrained_resnet/resnet50-imagenet.pth',
'resnet101': 'http://sceneparsing.csail.mit.edu/model/pretrained_resnet/resnet101-imagenet.pth'
}
def conv3x3(in_planes, out_planes, stride=1):
"3x3 convolution with padding"
return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
padding=1, bias=False)
class BasicBlock(nn.Module):
expansion = 1
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(BasicBlock, self).__init__()
self.conv1 = conv3x3(inplanes, planes, stride)
self.bn1 = BatchNorm2d(planes)
self.relu = nn.ReLU(inplace=True)
self.conv2 = conv3x3(planes, planes)
self.bn2 = BatchNorm2d(planes)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class Bottleneck(nn.Module):
expansion = 4
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(Bottleneck, self).__init__()
self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
self.bn1 = BatchNorm2d(planes)
self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
padding=1, bias=False)
self.bn2 = BatchNorm2d(planes)
self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
self.bn3 = BatchNorm2d(planes * 4)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class ResNet(nn.Module):
def __init__(self, block, layers, num_classes=1000):
self.inplanes = 128
super(ResNet, self).__init__()
self.conv1 = conv3x3(3, 64, stride=2)
self.bn1 = BatchNorm2d(64)
self.relu1 = nn.ReLU(inplace=True)
self.conv2 = conv3x3(64, 64)
self.bn2 = BatchNorm2d(64)
self.relu2 = nn.ReLU(inplace=True)
self.conv3 = conv3x3(64, 128)
self.bn3 = BatchNorm2d(128)
self.relu3 = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(block, 64, layers[0])
self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
self.avgpool = nn.AvgPool2d(7, stride=1)
self.fc = nn.Linear(512 * block.expansion, num_classes)
for m in self.modules():
if isinstance(m, nn.Conv2d):
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
m.weight.data.normal_(0, math.sqrt(2. / n))
elif isinstance(m, BatchNorm2d):
m.weight.data.fill_(1)
m.bias.data.zero_()
def _make_layer(self, block, planes, blocks, stride=1):
downsample = None
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
nn.Conv2d(self.inplanes, planes * block.expansion,
kernel_size=1, stride=stride, bias=False),
BatchNorm2d(planes * block.expansion),
)
layers = []
layers.append(block(self.inplanes, planes, stride, downsample))
self.inplanes = planes * block.expansion
for i in range(1, blocks):
layers.append(block(self.inplanes, planes))
return nn.Sequential(*layers)
def forward(self, x):
x = self.relu1(self.bn1(self.conv1(x)))
x = self.relu2(self.bn2(self.conv2(x)))
x = self.relu3(self.bn3(self.conv3(x)))
x = self.maxpool(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = self.avgpool(x)
x = x.view(x.size(0), -1)
x = self.fc(x)
return x
def resnet18(pretrained=False, **kwargs):
"""Constructs a ResNet-18 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
if pretrained:
model.load_state_dict(load_url(model_urls['resnet18']))
return model
def resnet50(pretrained=False, **kwargs):
"""Constructs a ResNet-50 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
if pretrained:
model.load_state_dict(load_url(model_urls['resnet50']), strict=False)
return model
def resnet101(pretrained=False, **kwargs):
"""Constructs a ResNet-101 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
if pretrained:
model.load_state_dict(load_url(model_urls['resnet101']), strict=False)
return model
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
@Author : Peike Li
@Contact : peike.li@yahoo.com
@File    :   resnext.py
@Time : 8/11/19 8:58 PM
@Desc :
@License : This source code is licensed under the license found in the
LICENSE file in the root directory of this source tree.
"""
import functools
import torch.nn as nn
import math
from torch.utils.model_zoo import load_url
from modules import InPlaceABNSync
BatchNorm2d = functools.partial(InPlaceABNSync, activation='none')
__all__ = ['ResNeXt', 'resnext101'] # support resnext 101
model_urls = {
'resnext50': 'http://sceneparsing.csail.mit.edu/model/pretrained_resnet/resnext50-imagenet.pth',
'resnext101': 'http://sceneparsing.csail.mit.edu/model/pretrained_resnet/resnext101-imagenet.pth'
}
def conv3x3(in_planes, out_planes, stride=1):
"3x3 convolution with padding"
return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
padding=1, bias=False)
class GroupBottleneck(nn.Module):
expansion = 2
def __init__(self, inplanes, planes, stride=1, groups=1, downsample=None):
super(GroupBottleneck, self).__init__()
self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
self.bn1 = BatchNorm2d(planes)
self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
padding=1, groups=groups, bias=False)
self.bn2 = BatchNorm2d(planes)
self.conv3 = nn.Conv2d(planes, planes * 2, kernel_size=1, bias=False)
self.bn3 = BatchNorm2d(planes * 2)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class ResNeXt(nn.Module):
def __init__(self, block, layers, groups=32, num_classes=1000):
self.inplanes = 128
super(ResNeXt, self).__init__()
self.conv1 = conv3x3(3, 64, stride=2)
self.bn1 = BatchNorm2d(64)
self.relu1 = nn.ReLU(inplace=True)
self.conv2 = conv3x3(64, 64)
self.bn2 = BatchNorm2d(64)
self.relu2 = nn.ReLU(inplace=True)
self.conv3 = conv3x3(64, 128)
self.bn3 = BatchNorm2d(128)
self.relu3 = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(block, 128, layers[0], groups=groups)
self.layer2 = self._make_layer(block, 256, layers[1], stride=2, groups=groups)
self.layer3 = self._make_layer(block, 512, layers[2], stride=2, groups=groups)
self.layer4 = self._make_layer(block, 1024, layers[3], stride=2, groups=groups)
self.avgpool = nn.AvgPool2d(7, stride=1)
self.fc = nn.Linear(1024 * block.expansion, num_classes)
for m in self.modules():
if isinstance(m, nn.Conv2d):
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels // m.groups
m.weight.data.normal_(0, math.sqrt(2. / n))
elif isinstance(m, BatchNorm2d):
m.weight.data.fill_(1)
m.bias.data.zero_()
def _make_layer(self, block, planes, blocks, stride=1, groups=1):
downsample = None
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
nn.Conv2d(self.inplanes, planes * block.expansion,
kernel_size=1, stride=stride, bias=False),
BatchNorm2d(planes * block.expansion),
)
layers = []
layers.append(block(self.inplanes, planes, stride, groups, downsample))
self.inplanes = planes * block.expansion
for i in range(1, blocks):
layers.append(block(self.inplanes, planes, groups=groups))
return nn.Sequential(*layers)
def forward(self, x):
x = self.relu1(self.bn1(self.conv1(x)))
x = self.relu2(self.bn2(self.conv2(x)))
x = self.relu3(self.bn3(self.conv3(x)))
x = self.maxpool(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = self.avgpool(x)
x = x.view(x.size(0), -1)
x = self.fc(x)
return x
def resnext101(pretrained=False, **kwargs):
"""Constructs a ResNet-101 model.
Args:
pretrained (bool): If True, returns a model pre-trained on Places
"""
model = ResNeXt(GroupBottleneck, [3, 4, 23, 3], **kwargs)
if pretrained:
model.load_state_dict(load_url(model_urls['resnext101']), strict=False)
return model
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
@Author : Peike Li
@Contact : peike.li@yahoo.com
@File : aspp.py
@Time : 8/4/19 3:36 PM
@Desc :
@License : This source code is licensed under the license found in the
LICENSE file in the root directory of this source tree.
"""
import torch
import torch.nn as nn
from torch.nn import functional as F
from modules import InPlaceABNSync
class ASPPModule(nn.Module):
"""
Reference:
Chen, Liang-Chieh, et al. *"Rethinking Atrous Convolution for Semantic Image Segmentation."*
"""
def __init__(self, features, out_features=512, inner_features=256, dilations=(12, 24, 36)):
super(ASPPModule, self).__init__()
self.conv1 = nn.Sequential(nn.AdaptiveAvgPool2d((1, 1)),
nn.Conv2d(features, inner_features, kernel_size=1, padding=0, dilation=1,
bias=False),
InPlaceABNSync(inner_features))
self.conv2 = nn.Sequential(
nn.Conv2d(features, inner_features, kernel_size=1, padding=0, dilation=1, bias=False),
InPlaceABNSync(inner_features))
self.conv3 = nn.Sequential(
nn.Conv2d(features, inner_features, kernel_size=3, padding=dilations[0], dilation=dilations[0], bias=False),
InPlaceABNSync(inner_features))
self.conv4 = nn.Sequential(
nn.Conv2d(features, inner_features, kernel_size=3, padding=dilations[1], dilation=dilations[1], bias=False),
InPlaceABNSync(inner_features))
self.conv5 = nn.Sequential(
nn.Conv2d(features, inner_features, kernel_size=3, padding=dilations[2], dilation=dilations[2], bias=False),
InPlaceABNSync(inner_features))
self.bottleneck = nn.Sequential(
nn.Conv2d(inner_features * 5, out_features, kernel_size=1, padding=0, dilation=1, bias=False),
InPlaceABNSync(out_features),
nn.Dropout2d(0.1)
)
def forward(self, x):
_, _, h, w = x.size()
feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True)
feat2 = self.conv2(x)
feat3 = self.conv3(x)
feat4 = self.conv4(x)
feat5 = self.conv5(x)
out = torch.cat((feat1, feat2, feat3, feat4, feat5), 1)
bottle = self.bottleneck(out)
return bottle
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
@Author : Peike Li
@Contact : peike.li@yahoo.com
@File : ocnet.py
@Time : 8/4/19 3:36 PM
@Desc :
@License : This source code is licensed under the license found in the
LICENSE file in the root directory of this source tree.
"""
import functools
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import functional as F
from modules import InPlaceABNSync
BatchNorm2d = functools.partial(InPlaceABNSync, activation='none')
class _SelfAttentionBlock(nn.Module):
'''
The basic implementation for self-attention block/non-local block
Input:
N X C X H X W
Parameters:
in_channels : the dimension of the input feature map
key_channels : the dimension after the key/query transform
value_channels : the dimension after the value transform
scale : choose the scale to downsample the input feature maps (save memory cost)
Return:
N X C X H X W
position-aware context features.(w/o concate or add with the input)
'''
def __init__(self, in_channels, key_channels, value_channels, out_channels=None, scale=1):
super(_SelfAttentionBlock, self).__init__()
self.scale = scale
self.in_channels = in_channels
self.out_channels = out_channels
self.key_channels = key_channels
self.value_channels = value_channels
if out_channels is None:
self.out_channels = in_channels
self.pool = nn.MaxPool2d(kernel_size=(scale, scale))
self.f_key = nn.Sequential(
nn.Conv2d(in_channels=self.in_channels, out_channels=self.key_channels,
kernel_size=1, stride=1, padding=0),
InPlaceABNSync(self.key_channels),
)
self.f_query = self.f_key
self.f_value = nn.Conv2d(in_channels=self.in_channels, out_channels=self.value_channels,
kernel_size=1, stride=1, padding=0)
self.W = nn.Conv2d(in_channels=self.value_channels, out_channels=self.out_channels,
kernel_size=1, stride=1, padding=0)
nn.init.constant_(self.W.weight, 0)
nn.init.constant_(self.W.bias, 0)
def forward(self, x):
batch_size, h, w = x.size(0), x.size(2), x.size(3)
if self.scale > 1:
x = self.pool(x)
value = self.f_value(x).view(batch_size, self.value_channels, -1)
value = value.permute(0, 2, 1)
query = self.f_query(x).view(batch_size, self.key_channels, -1)
query = query.permute(0, 2, 1)
key = self.f_key(x).view(batch_size, self.key_channels, -1)
sim_map = torch.matmul(query, key)
sim_map = (self.key_channels ** -.5) * sim_map
sim_map = F.softmax(sim_map, dim=-1)
context = torch.matmul(sim_map, value)
context = context.permute(0, 2, 1).contiguous()
context = context.view(batch_size, self.value_channels, *x.size()[2:])
context = self.W(context)
if self.scale > 1:
context = F.interpolate(input=context, size=(h, w), mode='bilinear', align_corners=True)
return context
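# Shape trace for a [N, C, H, W] input with scale 1: query/key are flattened to
# [N, H*W, key_channels] and [N, key_channels, H*W], sim_map is [N, H*W, H*W]
# softmax-normalized over the last dim, and the value aggregation gives
# [N, value_channels, H, W] before the final 1x1 projection W.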
class SelfAttentionBlock2D(_SelfAttentionBlock):
def __init__(self, in_channels, key_channels, value_channels, out_channels=None, scale=1):
super(SelfAttentionBlock2D, self).__init__(in_channels,
key_channels,
value_channels,
out_channels,
scale)
class BaseOC_Module(nn.Module):
"""
Implementation of the BaseOC module
Parameters:
in_features / out_features: the channels of the input / output feature maps.
dropout: we choose 0.05 as the default value.
size: you can apply multiple sizes. Here we only use one size.
Return:
features fused with Object context information.
"""
def __init__(self, in_channels, out_channels, key_channels, value_channels, dropout, sizes=([1])):
super(BaseOC_Module, self).__init__()
self.stages = []
self.stages = nn.ModuleList(
[self._make_stage(in_channels, out_channels, key_channels, value_channels, size) for size in sizes])
self.conv_bn_dropout = nn.Sequential(
nn.Conv2d(2 * in_channels, out_channels, kernel_size=1, padding=0),
InPlaceABNSync(out_channels),
nn.Dropout2d(dropout)
)
def _make_stage(self, in_channels, output_channels, key_channels, value_channels, size):
return SelfAttentionBlock2D(in_channels,
key_channels,
value_channels,
output_channels,
size)
def forward(self, feats):
priors = [stage(feats) for stage in self.stages]
context = priors[0]
for i in range(1, len(priors)):
context += priors[i]
output = self.conv_bn_dropout(torch.cat([context, feats], 1))
return output
class BaseOC_Context_Module(nn.Module):
"""
Output only the context features.
Parameters:
in_features / out_features: the channels of the input / output feature maps.
dropout: specify the dropout ratio
fusion: We provide two different fusion method, "concat" or "add"
size: we find that directly learn the attention weights on even 1/8 feature maps is hard.
Return:
features after "concat" or "add"
"""
def __init__(self, in_channels, out_channels, key_channels, value_channels, dropout, sizes=([1])):
super(BaseOC_Context_Module, self).__init__()
self.stages = []
self.stages = nn.ModuleList(
[self._make_stage(in_channels, out_channels, key_channels, value_channels, size) for size in sizes])
self.conv_bn_dropout = nn.Sequential(
nn.Conv2d(in_channels, out_channels, kernel_size=1, padding=0),
InPlaceABNSync(out_channels),
)
def _make_stage(self, in_channels, output_channels, key_channels, value_channels, size):
return SelfAttentionBlock2D(in_channels,
key_channels,
value_channels,
output_channels,
size)
def forward(self, feats):
priors = [stage(feats) for stage in self.stages]
context = priors[0]
for i in range(1, len(priors)):
context += priors[i]
output = self.conv_bn_dropout(context)
return output
class ASP_OC_Module(nn.Module):
def __init__(self, features, out_features=256, dilations=(12, 24, 36)):
super(ASP_OC_Module, self).__init__()
self.context = nn.Sequential(nn.Conv2d(features, out_features, kernel_size=3, padding=1, dilation=1, bias=True),
InPlaceABNSync(out_features),
BaseOC_Context_Module(in_channels=out_features, out_channels=out_features,
key_channels=out_features // 2, value_channels=out_features,
dropout=0, sizes=([2])))
self.conv2 = nn.Sequential(nn.Conv2d(features, out_features, kernel_size=1, padding=0, dilation=1, bias=False),
InPlaceABNSync(out_features))
self.conv3 = nn.Sequential(
nn.Conv2d(features, out_features, kernel_size=3, padding=dilations[0], dilation=dilations[0], bias=False),
InPlaceABNSync(out_features))
self.conv4 = nn.Sequential(
nn.Conv2d(features, out_features, kernel_size=3, padding=dilations[1], dilation=dilations[1], bias=False),
InPlaceABNSync(out_features))
self.conv5 = nn.Sequential(
nn.Conv2d(features, out_features, kernel_size=3, padding=dilations[2], dilation=dilations[2], bias=False),
InPlaceABNSync(out_features))
self.conv_bn_dropout = nn.Sequential(
nn.Conv2d(out_features * 5, out_features, kernel_size=1, padding=0, dilation=1, bias=False),
InPlaceABNSync(out_features),
nn.Dropout2d(0.1)
)
def _cat_each(self, feat1, feat2, feat3, feat4, feat5):
assert (len(feat1) == len(feat2))
z = []
for i in range(len(feat1)):
z.append(torch.cat((feat1[i], feat2[i], feat3[i], feat4[i], feat5[i]), 1))
return z
def forward(self, x):
if isinstance(x, Variable):
_, _, h, w = x.size()
elif isinstance(x, tuple) or isinstance(x, list):
_, _, h, w = x[0].size()
else:
raise RuntimeError('unknown input type')
feat1 = self.context(x)
feat2 = self.conv2(x)
feat3 = self.conv3(x)
feat4 = self.conv4(x)
feat5 = self.conv5(x)
if isinstance(x, Variable):
out = torch.cat((feat1, feat2, feat3, feat4, feat5), 1)
elif isinstance(x, tuple) or isinstance(x, list):
out = self._cat_each(feat1, feat2, feat3, feat4, feat5)
else:
raise RuntimeError('unknown input type')
output = self.conv_bn_dropout(out)
return output
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
@Author : Peike Li
@Contact : peike.li@yahoo.com
@File : psp.py
@Time : 8/4/19 3:36 PM
@Desc :
@License : This source code is licensed under the license found in the
LICENSE file in the root directory of this source tree.
"""
import torch
import torch.nn as nn
from torch.nn import functional as F
from modules import InPlaceABNSync
class PSPModule(nn.Module):
"""
Reference:
Zhao, Hengshuang, et al. *"Pyramid scene parsing network."*
"""
def __init__(self, features, out_features=512, sizes=(1, 2, 3, 6)):
super(PSPModule, self).__init__()
self.stages = []
self.stages = nn.ModuleList([self._make_stage(features, out_features, size) for size in sizes])
self.bottleneck = nn.Sequential(
nn.Conv2d(features + len(sizes) * out_features, out_features, kernel_size=3, padding=1, dilation=1,
bias=False),
InPlaceABNSync(out_features),
)
def _make_stage(self, features, out_features, size):
prior = nn.AdaptiveAvgPool2d(output_size=(size, size))
conv = nn.Conv2d(features, out_features, kernel_size=1, bias=False)
bn = InPlaceABNSync(out_features)
return nn.Sequential(prior, conv, bn)
def forward(self, feats):
h, w = feats.size(2), feats.size(3)
priors = [F.interpolate(input=stage(feats), size=(h, w), mode='bilinear', align_corners=True) for stage in
self.stages] + [feats]
bottle = self.bottleneck(torch.cat(priors, 1))
return bottle
opencv-python==4.4.0.46
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import os
from typing import List
import cv2
import torch
import numpy as np
from PIL import Image
import torchvision.transforms as transforms
from app.models.schp import networks
from app.models.schp.utils.transforms import get_affine_transform, transform_logits
# Dataset settings
dataset_settings = {
'lip': {
'input_size': [473, 473],
'num_classes': 20,
'label': ['Background', 'Hat', 'Hair', 'Glove', 'Sunglasses', 'Upper-clothes', 'Dress', 'Coat',
'Socks', 'Pants', 'Jumpsuits', 'Scarf', 'Skirt', 'Face', 'Left-arm', 'Right-arm',
'Left-leg', 'Right-leg', 'Left-shoe', 'Right-shoe']
},
'atr': {
'input_size': [512, 512],
'num_classes': 18,
'label': ['Background', 'Hat', 'Hair', 'Sunglasses', 'Upper-clothes', 'Skirt', 'Pants', 'Dress', 'Belt',
'Left-shoe', 'Right-shoe', 'Face', 'Left-leg', 'Right-leg', 'Left-arm', 'Right-arm', 'Bag', 'Scarf']
},
'pascal': {
'input_size': [512, 512],
'num_classes': 7,
'label': ['Background', 'Head', 'Torso', 'Upper Arms', 'Lower Arms', 'Upper Legs', 'Lower Legs'],
}
}
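# Example: dataset_settings['lip'] uses a 473x473 input with 20 classes, while
# 'atr' uses 512x512 with 18 classes; HumanParsingModel below reads input_size,
# num_classes and label from this table.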
def get_color_by_label(label: str) -> List[int]:
"""
Get the RGB color value for a given label name
Args:
label (str): label name, e.g. 'Face', 'Hair'
Returns:
List[int]: RGB color as an [R, G, B] list, each value in the range 0-255
"""
# LIP dataset labels
labels = ['Background', 'Hat', 'Hair', 'Glove', 'Sunglasses', 'Upper-clothes',
'Dress', 'Coat', 'Socks', 'Pants', 'Jumpsuits', 'Scarf', 'Skirt',
'Face', 'Left-arm', 'Right-arm', 'Left-leg', 'Right-leg', 'Left-shoe', 'Right-shoe']
# Check whether the label exists
if label not in labels:
return []
# Get the label index
label_index = labels.index(label)
# Build the palette
palette = get_palette(len(labels))
# Look up the RGB components for this label and return them as a list
r = palette[label_index * 3 + 0]
g = palette[label_index * 3 + 1]
b = palette[label_index * 3 + 2]
return [r, g, b]
def get_palette(num_cls):
"""返回用于可视化分割掩码的颜色映射"""
n = num_cls
palette = [0] * (n * 3)
for j in range(0, n):
lab = j
palette[j * 3 + 0] = 0
palette[j * 3 + 1] = 0
palette[j * 3 + 2] = 0
i = 0
while lab:
palette[j * 3 + 0] |= (((lab >> 0) & 1) << (7 - i))
palette[j * 3 + 1] |= (((lab >> 1) & 1) << (7 - i))
palette[j * 3 + 2] |= (((lab >> 2) & 1) << (7 - i))
i += 1
lab >>= 3
return palette
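# Example: with the 20 LIP labels above, get_color_by_label('Face') resolves to
# index 13, for which this palette construction yields [192, 0, 128].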
def _box2cs(box, aspect_ratio):
"""将边界框转换为中心点和尺度"""
x, y, w, h = box[:4]
return _xywh2cs(x, y, w, h, aspect_ratio)
def _xywh2cs(x, y, w, h, aspect_ratio):
"""将xywh格式转换为中心点和尺度"""
center = np.zeros((2), dtype=np.float32)
center[0] = x + w * 0.5
center[1] = y + h * 0.5
if w > aspect_ratio * h:
h = w * 1.0 / aspect_ratio
elif w < aspect_ratio * h:
w = h * aspect_ratio
scale = np.array([w, h], dtype=np.float32)
return center, scale
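# Hedged numeric sketch (values chosen for illustration, not from the original code):
# for a 400x600 image and a square 473x473 network input (aspect_ratio = 1.0),
# the box [0, 0, 399, 599] is centered and padded to a square scale.
example_center, example_scale = _box2cs([0, 0, 399, 599], aspect_ratio=1.0)
print(example_center)  # -> [199.5 299.5]
print(example_scale)   # -> [599. 599.]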
class HumanParsingModel:
def __init__(self, model_path, dataset='atr', device=None):
"""
初始化人体解析模型
Args:
model_path: 预训练模型路径
dataset: 数据集类型 ('lip', 'atr', 'pascal')
device: 计算设备 (None表示自动选择)
"""
self.dataset = dataset
self.device = device if device else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Dataset-specific settings
self.num_classes = dataset_settings[dataset]['num_classes']
self.input_size = dataset_settings[dataset]['input_size']
self.label = dataset_settings[dataset]['label']
self.aspect_ratio = self.input_size[1] * 1.0 / self.input_size[0]
self.input_size_array = np.asarray(self.input_size)
# Build the network
self.model = networks.init_model('resnet101', num_classes=self.num_classes, pretrained=None)
# Load the pretrained weights
state_dict = torch.load(model_path, map_location=self.device)['state_dict']
from collections import OrderedDict
new_state_dict = OrderedDict()
for k, v in state_dict.items():
name = k[7:] if k.startswith('module.') else k  # strip the 'module.' prefix
new_state_dict[name] = v
self.model.load_state_dict(new_state_dict)
# Move the model to the target device and switch to eval mode
self.model.to(self.device)
self.model.eval()
# Image preprocessing transform
self.transform = transforms.Compose([
transforms.ToTensor(),
transforms.Normalize(mean=[0.406, 0.456, 0.485], std=[0.225, 0.224, 0.229])
])
# Build the palette
self.palette = get_palette(self.num_classes)
print(f"Model loaded on device: {self.device}")
print(f"Dataset: {dataset}, number of classes: {self.num_classes}")
def process_single_image(model, input_image):
"""
处理单张图片
Args:
model: HumanParsingModel实例
input_image: 输入图片,可以是:
- numpy数组 (H, W, C) BGR格式
- PIL Image对象
- 图片文件路径字符串
Returns:
PIL Image对象,包含分割结果的彩色图像
"""
# Handle the different input types
if isinstance(input_image, str):
# File path
img = cv2.imread(input_image, cv2.IMREAD_COLOR)
if img is None:
raise ValueError(f"无法读取图片: {input_image}")
elif isinstance(input_image, Image.Image):
# PIL Image: convert to a BGR numpy array
img = cv2.cvtColor(np.array(input_image), cv2.COLOR_RGB2BGR)
elif isinstance(input_image, np.ndarray):
# numpy array: use as-is
img = input_image.copy()
else:
raise ValueError("输入图片格式不支持,请使用numpy数组、PIL Image或文件路径")
h, w, _ = img.shape
# Person center and scale
person_center, s = _box2cs([0, 0, w - 1, h - 1], model.aspect_ratio)
r = 0
# Affine transform matrix
trans = get_affine_transform(person_center, s, r, model.input_size_array)
# Apply the affine transform
input_tensor = cv2.warpAffine(
img,
trans,
(int(model.input_size[1]), int(model.input_size[0])),
flags=cv2.INTER_LINEAR,
borderMode=cv2.BORDER_CONSTANT,
borderValue=(0, 0, 0)
)
# Preprocess
input_tensor = model.transform(input_tensor)
input_tensor = input_tensor.unsqueeze(0)  # add batch dimension
input_tensor = input_tensor.to(model.device)
# Model inference
with torch.no_grad():
output = model.model(input_tensor)
# Upsample to the network input size
upsample = torch.nn.Upsample(size=model.input_size, mode='bilinear', align_corners=True)
upsample_output = upsample(output[0][-1][0].unsqueeze(0))
upsample_output = upsample_output.squeeze()
upsample_output = upsample_output.permute(1, 2, 0) # CHW -> HWC
# Warp back to the original image size
logits_result = transform_logits(
upsample_output.data.cpu().numpy(),
person_center,
s,
w,
h,
input_size=model.input_size
)
# Per-pixel class prediction
parsing_result = np.argmax(logits_result, axis=2)
# Convert to a PIL image and apply the palette
output_img = Image.fromarray(np.asarray(parsing_result, dtype=np.uint8))
output_img.putpalette(model.palette)
return output_img
# Convenience function
def parse_human_image(input_image, model_path, dataset='lip', device=None):
"""
便捷函数:解析单张人体图像
Args:
input_image: 输入图片 (numpy数组、PIL Image或文件路径)
model_path: 预训练模型路径
dataset: 数据集类型 ('lip', 'atr', 'pascal')
device: 计算设备
Returns:
PIL Image对象,包含分割结果
"""
# Create the model
model = HumanParsingModel(model_path, dataset, device)
# Run parsing
result = process_single_image(model, input_image)
return result
# Usage examples
if __name__ == '__main__':
# Example 1: convenience function
model_path = r"D:\work\PycharmProjects\PythonProject\checkpoints\exp-schp-201908261155-lip.pth"
input_image_path = r"D:\work\PycharmProjects\PythonProject\img1.jpg"
# result_image = parse_human_image(input_image_path, model_path, dataset='atr')
# result_image.save(r"D:\work\PycharmProjects\PythonProject\output_result.png")
# Example 2: reuse a model instance (recommended for batch processing)
model = HumanParsingModel(model_path, dataset='lip')
# Process several images
image_paths = [r"D:\work\PycharmProjects\PythonProject\img1.jpg", r"D:\work\PycharmProjects\PythonProject\img2.jpg", r"D:\work\PycharmProjects\PythonProject\img3.jpg"]
for i, img_path in enumerate(image_paths):
result = process_single_image(model, img_path)
result.save(f"result_{i}.png")
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
@Author : Peike Li
@Contact : peike.li@yahoo.com
@File : train.py
@Time : 8/4/19 3:36 PM
@Desc :
@License : This source code is licensed under the license found in the
LICENSE file in the root directory of this source tree.
"""
import os
import json
import timeit
import argparse
import torch
import torch.optim as optim
import torchvision.transforms as transforms
import torch.backends.cudnn as cudnn
from torch.utils import data
import networks
import utils.schp as schp
from datasets.datasets import LIPDataSet
from datasets.target_generation import generate_edge_tensor
from utils.transforms import BGR2RGB_transform
from utils.criterion import CriterionAll
from utils.encoding import DataParallelModel, DataParallelCriterion
from utils.warmup_scheduler import SGDRScheduler
def get_arguments():
"""Parse all the arguments provided from the CLI.
Returns:
The parsed arguments.
"""
parser = argparse.ArgumentParser(description="Self Correction for Human Parsing")
# Network Structure
parser.add_argument("--arch", type=str, default='resnet101')
# Data Preference
parser.add_argument("--data-dir", type=str, default='./data/LIP')
parser.add_argument("--batch-size", type=int, default=16)
parser.add_argument("--input-size", type=str, default='473,473')
parser.add_argument("--num-classes", type=int, default=20)
parser.add_argument("--ignore-label", type=int, default=255)
parser.add_argument("--random-mirror", action="store_true")
parser.add_argument("--random-scale", action="store_true")
# Training Strategy
parser.add_argument("--learning-rate", type=float, default=7e-3)
parser.add_argument("--momentum", type=float, default=0.9)
parser.add_argument("--weight-decay", type=float, default=5e-4)
parser.add_argument("--gpu", type=str, default='0,1,2')
parser.add_argument("--start-epoch", type=int, default=0)
parser.add_argument("--epochs", type=int, default=150)
parser.add_argument("--eval-epochs", type=int, default=10)
parser.add_argument("--imagenet-pretrain", type=str, default='./pretrain_model/resnet101-imagenet.pth')
parser.add_argument("--log-dir", type=str, default='./log')
parser.add_argument("--model-restore", type=str, default='./log/checkpoint.pth.tar')
parser.add_argument("--schp-start", type=int, default=100, help='schp start epoch')
parser.add_argument("--cycle-epochs", type=int, default=10, help='schp cyclical epoch')
parser.add_argument("--schp-restore", type=str, default='./log/schp_checkpoint.pth.tar')
parser.add_argument("--lambda-s", type=float, default=1, help='segmentation loss weight')
parser.add_argument("--lambda-e", type=float, default=1, help='edge loss weight')
parser.add_argument("--lambda-c", type=float, default=0.1, help='segmentation-edge consistency loss weight')
return parser.parse_args()
def main():
args = get_arguments()
print(args)
start_epoch = 0
cycle_n = 0
if not os.path.exists(args.log_dir):
os.makedirs(args.log_dir)
with open(os.path.join(args.log_dir, 'args.json'), 'w') as opt_file:
json.dump(vars(args), opt_file)
gpus = [int(i) for i in args.gpu.split(',')]
if not args.gpu == 'None':
os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
input_size = list(map(int, args.input_size.split(',')))
cudnn.enabled = True
cudnn.benchmark = True
# Model Initialization
AugmentCE2P = networks.init_model(args.arch, num_classes=args.num_classes, pretrained=args.imagenet_pretrain)
model = DataParallelModel(AugmentCE2P)
model.cuda()
IMAGE_MEAN = AugmentCE2P.mean
IMAGE_STD = AugmentCE2P.std
INPUT_SPACE = AugmentCE2P.input_space
print('image mean: {}'.format(IMAGE_MEAN))
print('image std: {}'.format(IMAGE_STD))
print('input space:{}'.format(INPUT_SPACE))
restore_from = args.model_restore
if os.path.exists(restore_from):
print('Resume training from {}'.format(restore_from))
checkpoint = torch.load(restore_from)
model.load_state_dict(checkpoint['state_dict'])
start_epoch = checkpoint['epoch']
SCHP_AugmentCE2P = networks.init_model(args.arch, num_classes=args.num_classes, pretrained=args.imagenet_pretrain)
schp_model = DataParallelModel(SCHP_AugmentCE2P)
schp_model.cuda()
if os.path.exists(args.schp_restore):
print('Resuming schp checkpoint from {}'.format(args.schp_restore))
schp_checkpoint = torch.load(args.schp_restore)
schp_model_state_dict = schp_checkpoint['state_dict']
cycle_n = schp_checkpoint['cycle_n']
schp_model.load_state_dict(schp_model_state_dict)
# Loss Function
criterion = CriterionAll(lambda_1=args.lambda_s, lambda_2=args.lambda_e, lambda_3=args.lambda_c,
num_classes=args.num_classes)
criterion = DataParallelCriterion(criterion)
criterion.cuda()
# Data Loader
if INPUT_SPACE == 'BGR':
print('BGR Transformation')
transform = transforms.Compose([
transforms.ToTensor(),
transforms.Normalize(mean=IMAGE_MEAN,
std=IMAGE_STD),
])
elif INPUT_SPACE == 'RGB':
print('RGB Transformation')
transform = transforms.Compose([
transforms.ToTensor(),
BGR2RGB_transform(),
transforms.Normalize(mean=IMAGE_MEAN,
std=IMAGE_STD),
])
train_dataset = LIPDataSet(args.data_dir, 'train', crop_size=input_size, transform=transform)
train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size * len(gpus),
num_workers=16, shuffle=True, pin_memory=True, drop_last=True)
print('Total training samples: {}'.format(len(train_dataset)))
# Optimizer Initialization
optimizer = optim.SGD(model.parameters(), lr=args.learning_rate, momentum=args.momentum,
weight_decay=args.weight_decay)
lr_scheduler = SGDRScheduler(optimizer, total_epoch=args.epochs,
eta_min=args.learning_rate / 100, warmup_epoch=10,
start_cyclical=args.schp_start, cyclical_base_lr=args.learning_rate / 2,
cyclical_epoch=args.cycle_epochs)
total_iters = args.epochs * len(train_loader)
start = timeit.default_timer()
for epoch in range(start_epoch, args.epochs):
lr_scheduler.step(epoch=epoch)
lr = lr_scheduler.get_lr()[0]
model.train()
for i_iter, batch in enumerate(train_loader):
i_iter += len(train_loader) * epoch
images, labels, _ = batch
labels = labels.cuda(non_blocking=True)
edges = generate_edge_tensor(labels)
labels = labels.type(torch.cuda.LongTensor)
edges = edges.type(torch.cuda.LongTensor)
preds = model(images)
# Online Self Correction Cycle with Label Refinement
if cycle_n >= 1:
with torch.no_grad():
soft_preds = schp_model(images)
soft_parsing = []
soft_edge = []
for soft_pred in soft_preds:
soft_parsing.append(soft_pred[0][-1])
soft_edge.append(soft_pred[1][-1])
soft_preds = torch.cat(soft_parsing, dim=0)
soft_edges = torch.cat(soft_edge, dim=0)
else:
soft_preds = None
soft_edges = None
loss = criterion(preds, [labels, edges, soft_preds, soft_edges], cycle_n)
optimizer.zero_grad()
loss.backward()
optimizer.step()
if i_iter % 100 == 0:
print('iter = {} of {} completed, lr = {}, loss = {}'.format(i_iter, total_iters, lr,
loss.data.cpu().numpy()))
if (epoch + 1) % (args.eval_epochs) == 0:
schp.save_schp_checkpoint({
'epoch': epoch + 1,
'state_dict': model.state_dict(),
}, False, args.log_dir, filename='checkpoint_{}.pth.tar'.format(epoch + 1))
# Self Correction Cycle with Model Aggregation
if (epoch + 1) >= args.schp_start and (epoch + 1 - args.schp_start) % args.cycle_epochs == 0:
print('Self-correction cycle number {}'.format(cycle_n))
schp.moving_average(schp_model, model, 1.0 / (cycle_n + 1))
cycle_n += 1
schp.bn_re_estimate(train_loader, schp_model)
schp.save_schp_checkpoint({
'state_dict': schp_model.state_dict(),
'cycle_n': cycle_n,
}, False, args.log_dir, filename='schp_{}_checkpoint.pth.tar'.format(cycle_n))
torch.cuda.empty_cache()
end = timeit.default_timer()
print('epoch = {} of {} completed using {} s'.format(epoch, args.epochs,
(end - start) / (epoch - start_epoch + 1)))
end = timeit.default_timer()
print('Training Finished in {} seconds'.format(end - start))
if __name__ == '__main__':
main()
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
@Author : Peike Li
@Contact : peike.li@yahoo.com
@File : kl_loss.py
@Time : 7/23/19 4:02 PM
@Desc :
@License : This source code is licensed under the license found in the
LICENSE file in the root directory of this source tree.
"""
import torch
import torch.nn.functional as F
from torch import nn
from datasets.target_generation import generate_edge_tensor
class ConsistencyLoss(nn.Module):
def __init__(self, ignore_index=255):
super(ConsistencyLoss, self).__init__()
self.ignore_index=ignore_index
def forward(self, parsing, edge, label):
parsing_pre = torch.argmax(parsing, dim=1)
parsing_pre[label==self.ignore_index]=self.ignore_index
generated_edge = generate_edge_tensor(parsing_pre)
edge_pre = torch.argmax(edge, dim=1)
v_generate_edge = generated_edge[label!=255]
v_edge_pre = edge_pre[label!=255]
v_edge_pre = v_edge_pre.type(torch.cuda.FloatTensor)
positive_union = (v_generate_edge==1)&(v_edge_pre==1) # only the positive values count
return F.smooth_l1_loss(v_generate_edge[positive_union].squeeze(0), v_edge_pre[positive_union].squeeze(0))
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
@Author : Peike Li
@Contact : peike.li@yahoo.com
@File : criterion.py
@Time : 8/30/19 8:59 PM
@Desc :
@License : This source code is licensed under the license found in the
LICENSE file in the root directory of this source tree.
"""
import torch.nn as nn
import torch
import numpy as np
from torch.nn import functional as F
from .lovasz_softmax import LovaszSoftmax
from .kl_loss import KLDivergenceLoss
from .consistency_loss import ConsistencyLoss
NUM_CLASSES = 20
class CriterionAll(nn.Module):
def __init__(self, use_class_weight=False, ignore_index=255, lambda_1=1, lambda_2=1, lambda_3=1,
num_classes=20):
super(CriterionAll, self).__init__()
self.ignore_index = ignore_index
self.use_class_weight = use_class_weight
self.criterion = torch.nn.CrossEntropyLoss(ignore_index=ignore_index)
self.lovasz = LovaszSoftmax(ignore_index=ignore_index)
self.kldiv = KLDivergenceLoss(ignore_index=ignore_index)
self.reg = ConsistencyLoss(ignore_index=ignore_index)
self.lamda_1 = lambda_1
self.lamda_2 = lambda_2
self.lamda_3 = lambda_3
self.num_classes = num_classes
def parsing_loss(self, preds, target, cycle_n=None):
"""
Loss function definition.
Args:
preds: [[parsing result1, parsing result2], [edge result]]
target: [parsing label, edge label, soft parsing preds (or None), soft edge preds (or None)]
Returns:
Calculated Loss.
"""
h, w = target[0].size(1), target[0].size(2)
pos_num = torch.sum(target[1] == 1, dtype=torch.float)
neg_num = torch.sum(target[1] == 0, dtype=torch.float)
weight_pos = neg_num / (pos_num + neg_num)
weight_neg = pos_num / (pos_num + neg_num)
weights = torch.tensor([weight_neg, weight_pos]) # edge loss weight
loss = 0
# loss for segmentation
preds_parsing = preds[0]
for pred_parsing in preds_parsing:
scale_pred = F.interpolate(input=pred_parsing, size=(h, w),
mode='bilinear', align_corners=True)
loss += 0.5 * self.lamda_1 * self.lovasz(scale_pred, target[0])
if target[2] is None:
loss += 0.5 * self.lamda_1 * self.criterion(scale_pred, target[0])
else:
soft_scale_pred = F.interpolate(input=target[2], size=(h, w),
mode='bilinear', align_corners=True)
soft_scale_pred = moving_average(soft_scale_pred, to_one_hot(target[0], num_cls=self.num_classes),
1.0 / (cycle_n + 1.0))
loss += 0.5 * self.lamda_1 * self.kldiv(scale_pred, soft_scale_pred, target[0])
# loss for edge
preds_edge = preds[1]
for pred_edge in preds_edge:
scale_pred = F.interpolate(input=pred_edge, size=(h, w),
mode='bilinear', align_corners=True)
if target[3] is None:
loss += self.lamda_2 * F.cross_entropy(scale_pred, target[1],
weights.cuda(), ignore_index=self.ignore_index)
else:
soft_scale_edge = F.interpolate(input=target[3], size=(h, w),
mode='bilinear', align_corners=True)
soft_scale_edge = moving_average(soft_scale_edge, to_one_hot(target[1], num_cls=2),
1.0 / (cycle_n + 1.0))
loss += self.lamda_2 * self.kldiv(scale_pred, soft_scale_edge, target[0])
# consistency regularization
preds_parsing = preds[0]
preds_edge = preds[1]
for pred_parsing in preds_parsing:
scale_pred = F.interpolate(input=pred_parsing, size=(h, w),
mode='bilinear', align_corners=True)
scale_edge = F.interpolate(input=preds_edge[0], size=(h, w),
mode='bilinear', align_corners=True)
loss += self.lamda_3 * self.reg(scale_pred, scale_edge, target[0])
return loss
def forward(self, preds, target, cycle_n=None):
loss = self.parsing_loss(preds, target, cycle_n)
return loss
def _generate_weights(self, masks, num_classes):
"""
masks: torch.Tensor with shape [B, H, W]
"""
masks_label = masks.data.cpu().numpy().astype(np.int64)
pixel_nums = []
tot_pixels = 0
for i in range(num_classes):
pixel_num_of_cls_i = np.sum(masks_label == i).astype(np.float)
pixel_nums.append(pixel_num_of_cls_i)
tot_pixels += pixel_num_of_cls_i
weights = []
for i in range(num_classes):
weights.append(
(tot_pixels - pixel_nums[i]) / tot_pixels / (num_classes - 1)
)
weights = np.array(weights, dtype=np.float)
# weights = torch.from_numpy(weights).float().to(masks.device)
return weights
def moving_average(target1, target2, alpha=1.0):
target = 0
target += (1.0 - alpha) * target1
target += target2 * alpha
return target
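# Hedged illustration (values chosen here): with alpha = 1 / (cycle_n + 1) the soft
# teacher predictions are blended with the one-hot ground truth, e.g. alpha = 0.5
# after the first self-correction cycle gives an equal mix. Assumes torch imported above.
blended = moving_average(torch.tensor([0.0, 1.0]), torch.tensor([1.0, 0.0]), alpha=0.5)
print(blended)  # tensor([0.5000, 0.5000])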
def to_one_hot(tensor, num_cls, dim=1, ignore_index=255):
b, h, w = tensor.shape
tensor[tensor == ignore_index] = 0
onehot_tensor = torch.zeros(b, num_cls, h, w).cuda()
onehot_tensor.scatter_(dim, tensor.unsqueeze(dim), 1)
return onehot_tensor
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang
## ECE Department, Rutgers University
## Email: zhang.hang@rutgers.edu
## Copyright (c) 2017
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
"""Encoding Data Parallel"""
import threading
import functools
import torch
from torch.autograd import Variable, Function
import torch.cuda.comm as comm
from torch.nn.parallel.data_parallel import DataParallel
from torch.nn.parallel.parallel_apply import get_a_var
from torch.nn.parallel._functions import ReduceAddCoalesced, Broadcast
torch_ver = torch.__version__[:3]
__all__ = ['allreduce', 'DataParallelModel', 'DataParallelCriterion', 'patch_replication_callback']
def allreduce(*inputs):
"""Cross GPU all reduce autograd operation for calculate mean and
variance in SyncBN.
"""
return AllReduce.apply(*inputs)
class AllReduce(Function):
@staticmethod
def forward(ctx, num_inputs, *inputs):
ctx.num_inputs = num_inputs
ctx.target_gpus = [inputs[i].get_device() for i in range(0, len(inputs), num_inputs)]
inputs = [inputs[i:i + num_inputs]
for i in range(0, len(inputs), num_inputs)]
# sort before reduce sum
inputs = sorted(inputs, key=lambda i: i[0].get_device())
results = comm.reduce_add_coalesced(inputs, ctx.target_gpus[0])
outputs = comm.broadcast_coalesced(results, ctx.target_gpus)
return tuple([t for tensors in outputs for t in tensors])
@staticmethod
def backward(ctx, *inputs):
inputs = [i.data for i in inputs]
inputs = [inputs[i:i + ctx.num_inputs]
for i in range(0, len(inputs), ctx.num_inputs)]
results = comm.reduce_add_coalesced(inputs, ctx.target_gpus[0])
outputs = comm.broadcast_coalesced(results, ctx.target_gpus)
return (None,) + tuple([Variable(t) for tensors in outputs for t in tensors])
class Reduce(Function):
@staticmethod
def forward(ctx, *inputs):
ctx.target_gpus = [inputs[i].get_device() for i in range(len(inputs))]
inputs = sorted(inputs, key=lambda i: i.get_device())
return comm.reduce_add(inputs)
@staticmethod
def backward(ctx, gradOutput):
return Broadcast.apply(ctx.target_gpus, gradOutput)
class DataParallelModel(DataParallel):
"""Implements data parallelism at the module level.
This container parallelizes the application of the given module by
splitting the input across the specified devices by chunking in the
batch dimension.
In the forward pass, the module is replicated on each device,
and each replica handles a portion of the input. During the backwards pass, gradients from each replica are summed into the original module.
Note that the outputs are not gathered, please use compatible
:class:`encoding.parallel.DataParallelCriterion`.
The batch size should be larger than the number of GPUs used. It should
also be an integer multiple of the number of GPUs so that each chunk is
the same size (so that each GPU processes the same number of samples).
Args:
module: module to be parallelized
device_ids: CUDA devices (default: all devices)
Reference:
Hang Zhang, Kristin Dana, Jianping Shi, Zhongyue Zhang, Xiaogang Wang, Ambrish Tyagi,
Amit Agrawal. "Context Encoding for Semantic Segmentation."
*The IEEE Conference on Computer Vision and Pattern Recognition (CVPR) 2018*
Example::
>>> net = encoding.nn.DataParallelModel(model, device_ids=[0, 1, 2])
>>> y = net(x)
"""
def gather(self, outputs, output_device):
return outputs
def replicate(self, module, device_ids):
modules = super(DataParallelModel, self).replicate(module, device_ids)
return modules
class DataParallelCriterion(DataParallel):
"""
Calculate the loss on multiple GPUs, which balances memory usage for
semantic segmentation.
The targets are split across the specified devices by chunking in
the batch dimension. Please use together with :class:`encoding.parallel.DataParallelModel`.
Reference:
Hang Zhang, Kristin Dana, Jianping Shi, Zhongyue Zhang, Xiaogang Wang, Ambrish Tyagi,
Amit Agrawal. "Context Encoding for Semantic Segmentation."
*The IEEE Conference on Computer Vision and Pattern Recognition (CVPR) 2018*
Example::
>>> net = encoding.nn.DataParallelModel(model, device_ids=[0, 1, 2])
>>> criterion = encoding.nn.DataParallelCriterion(criterion, device_ids=[0, 1, 2])
>>> y = net(x)
>>> loss = criterion(y, target)
"""
def forward(self, inputs, *targets, **kwargs):
# inputs should already be scattered
# scattering the targets instead
if not self.device_ids:
return self.module(inputs, *targets, **kwargs)
targets, kwargs = self.scatter(targets, kwargs, self.device_ids)
if len(self.device_ids) == 1:
return self.module(inputs, *targets[0], **kwargs[0])
replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
outputs = _criterion_parallel_apply(replicas, inputs, targets, kwargs)
return Reduce.apply(*outputs) / len(outputs)
def _criterion_parallel_apply(modules, inputs, targets, kwargs_tup=None, devices=None):
assert len(modules) == len(inputs)
assert len(targets) == len(inputs)
if kwargs_tup:
assert len(modules) == len(kwargs_tup)
else:
kwargs_tup = ({},) * len(modules)
if devices is not None:
assert len(modules) == len(devices)
else:
devices = [None] * len(modules)
lock = threading.Lock()
results = {}
if torch_ver != "0.3":
grad_enabled = torch.is_grad_enabled()
def _worker(i, module, input, target, kwargs, device=None):
if torch_ver != "0.3":
torch.set_grad_enabled(grad_enabled)
if device is None:
device = get_a_var(input).get_device()
try:
if not isinstance(input, tuple):
input = (input,)
with torch.cuda.device(device):
output = module(*(input + target), **kwargs)
with lock:
results[i] = output
except Exception as e:
with lock:
results[i] = e
if len(modules) > 1:
threads = [threading.Thread(target=_worker,
args=(i, module, input, target,
kwargs, device),)
for i, (module, input, target, kwargs, device) in
enumerate(zip(modules, inputs, targets, kwargs_tup, devices))]
for thread in threads:
thread.start()
for thread in threads:
thread.join()
else:
_worker(0, modules[0], inputs[0], targets[0], kwargs_tup[0], devices[0])
outputs = []
for i in range(len(inputs)):
output = results[i]
if isinstance(output, Exception):
raise output
outputs.append(output)
return outputs
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
@Author : Peike Li
@Contact : peike.li@yahoo.com
@File : kl_loss.py
@Time : 7/23/19 4:02 PM
@Desc :
@License : This source code is licensed under the license found in the
LICENSE file in the root directory of this source tree.
"""
import torch
import torch.nn.functional as F
from torch import nn
def flatten_probas(input, target, labels, ignore=255):
"""
Flattens predictions in the batch.
"""
B, C, H, W = input.size()
input = input.permute(0, 2, 3, 1).contiguous().view(-1, C) # B * H * W, C = P, C
target = target.permute(0, 2, 3, 1).contiguous().view(-1, C) # B * H * W, C = P, C
labels = labels.view(-1)
if ignore is None:
return input, target
valid = (labels != ignore)
vinput = input[valid.nonzero().squeeze()]
vtarget = target[valid.nonzero().squeeze()]
return vinput, vtarget
class KLDivergenceLoss(nn.Module):
def __init__(self, ignore_index=255, T=1):
super(KLDivergenceLoss, self).__init__()
self.ignore_index=ignore_index
self.T = T
def forward(self, input, target, label):
log_input_prob = F.log_softmax(input / self.T, dim=1)
target_prob = F.softmax(target / self.T, dim=1)
loss = F.kl_div(*flatten_probas(log_input_prob, target_prob, label, ignore=self.ignore_index))
return self.T*self.T*loss # balanced
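# Hedged CPU sketch of how this loss is driven (shapes assumed, mirroring CriterionAll):
# student logits vs. soft teacher logits, with ignored pixels dropped by flatten_probas.
student = torch.randn(2, 20, 8, 8)    # [B, C, H, W] raw logits
teacher = torch.randn(2, 20, 8, 8)    # soft predictions from the aggregated SCHP model
label = torch.randint(0, 20, (2, 8, 8))
label[0, 0, 0] = 255                  # this pixel is excluded from the KL term
loss = KLDivergenceLoss(ignore_index=255, T=1)(student, teacher, label)
print(loss.item())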
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
@Author : Peike Li
@Contact : peike.li@yahoo.com
@File : lovasz_softmax.py
@Time : 8/30/19 7:12 PM
@Desc : Lovasz-Softmax and Jaccard hinge loss in PyTorch
Maxim Berman 2018 ESAT-PSI KU Leuven (MIT License)
@License : This source code is licensed under the license found in the
LICENSE file in the root directory of this source tree.
"""
from __future__ import print_function, division
import torch
from torch.autograd import Variable
import torch.nn.functional as F
import numpy as np
from torch import nn
try:
from itertools import ifilterfalse
except ImportError: # py3k
from itertools import filterfalse as ifilterfalse
def lovasz_grad(gt_sorted):
"""
Computes gradient of the Lovasz extension w.r.t sorted errors
See Alg. 1 in paper
"""
p = len(gt_sorted)
gts = gt_sorted.sum()
intersection = gts - gt_sorted.float().cumsum(0)
union = gts + (1 - gt_sorted).float().cumsum(0)
jaccard = 1. - intersection / union
if p > 1: # cover 1-pixel case
jaccard[1:p] = jaccard[1:p] - jaccard[0:-1]
return jaccard
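# Worked example (values chosen here) of the Lovasz gradient for an already-sorted
# ground-truth vector; assumes lovasz_grad above and torch are in scope.
gt_sorted = torch.tensor([1, 1, 0, 1])
# gts = 3, intersection = [2, 1, 1, 0], union = [3, 3, 4, 4]
# jaccard = [1/3, 2/3, 3/4, 1] -> first differences: [1/3, 1/3, 1/12, 1/4]
print(lovasz_grad(gt_sorted))  # tensor([0.3333, 0.3333, 0.0833, 0.2500])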
def iou_binary(preds, labels, EMPTY=1., ignore=None, per_image=True):
"""
IoU for foreground class
binary: 1 foreground, 0 background
"""
if not per_image:
preds, labels = (preds,), (labels,)
ious = []
for pred, label in zip(preds, labels):
intersection = ((label == 1) & (pred == 1)).sum()
union = ((label == 1) | ((pred == 1) & (label != ignore))).sum()
if not union:
iou = EMPTY
else:
iou = float(intersection) / float(union)
ious.append(iou)
iou = mean(ious) # mean across images if per_image
return 100 * iou
def iou(preds, labels, C, EMPTY=1., ignore=None, per_image=False):
"""
Array of IoU for each (non ignored) class
"""
if not per_image:
preds, labels = (preds,), (labels,)
ious = []
for pred, label in zip(preds, labels):
iou = []
for i in range(C):
if i != ignore: # The ignored label is sometimes among predicted classes (ENet - CityScapes)
intersection = ((label == i) & (pred == i)).sum()
union = ((label == i) | ((pred == i) & (label != ignore))).sum()
if not union:
iou.append(EMPTY)
else:
iou.append(float(intersection) / float(union))
ious.append(iou)
ious = [mean(iou) for iou in zip(*ious)] # mean across images if per_image
return 100 * np.array(ious)
# --------------------------- BINARY LOSSES ---------------------------
def lovasz_hinge(logits, labels, per_image=True, ignore=None):
"""
Binary Lovasz hinge loss
logits: [B, H, W] Variable, logits at each pixel (between -\infty and +\infty)
labels: [B, H, W] Tensor, binary ground truth masks (0 or 1)
per_image: compute the loss per image instead of per batch
ignore: void class id
"""
if per_image:
loss = mean(lovasz_hinge_flat(*flatten_binary_scores(log.unsqueeze(0), lab.unsqueeze(0), ignore))
for log, lab in zip(logits, labels))
else:
loss = lovasz_hinge_flat(*flatten_binary_scores(logits, labels, ignore))
return loss
def lovasz_hinge_flat(logits, labels):
"""
Binary Lovasz hinge loss
logits: [P] Variable, logits at each prediction (between -\infty and +\infty)
labels: [P] Tensor, binary ground truth labels (0 or 1)
ignore: label to ignore
"""
if len(labels) == 0:
# only void pixels, the gradients should be 0
return logits.sum() * 0.
signs = 2. * labels.float() - 1.
errors = (1. - logits * Variable(signs))
errors_sorted, perm = torch.sort(errors, dim=0, descending=True)
perm = perm.data
gt_sorted = labels[perm]
grad = lovasz_grad(gt_sorted)
loss = torch.dot(F.relu(errors_sorted), Variable(grad))
return loss
def flatten_binary_scores(scores, labels, ignore=None):
"""
Flattens predictions in the batch (binary case)
Remove labels equal to 'ignore'
"""
scores = scores.view(-1)
labels = labels.view(-1)
if ignore is None:
return scores, labels
valid = (labels != ignore)
vscores = scores[valid]
vlabels = labels[valid]
return vscores, vlabels
class StableBCELoss(torch.nn.modules.Module):
def __init__(self):
super(StableBCELoss, self).__init__()
def forward(self, input, target):
neg_abs = - input.abs()
loss = input.clamp(min=0) - input * target + (1 + neg_abs.exp()).log()
return loss.mean()
def binary_xloss(logits, labels, ignore=None):
"""
Binary Cross entropy loss
logits: [B, H, W] Variable, logits at each pixel (between -\infty and +\infty)
labels: [B, H, W] Tensor, binary ground truth masks (0 or 1)
ignore: void class id
"""
logits, labels = flatten_binary_scores(logits, labels, ignore)
loss = StableBCELoss()(logits, Variable(labels.float()))
return loss
# --------------------------- MULTICLASS LOSSES ---------------------------
def lovasz_softmax(probas, labels, classes='present', per_image=False, ignore=255, weighted=None):
"""
Multi-class Lovasz-Softmax loss
probas: [B, C, H, W] Variable, class probabilities at each prediction (between 0 and 1).
Interpreted as binary (sigmoid) output with outputs of size [B, H, W].
labels: [B, H, W] Tensor, ground truth labels (between 0 and C - 1)
classes: 'all' for all, 'present' for classes present in labels, or a list of classes to average.
per_image: compute the loss per image instead of per batch
ignore: void class labels
"""
if per_image:
loss = mean(lovasz_softmax_flat(*flatten_probas(prob.unsqueeze(0), lab.unsqueeze(0), ignore), classes=classes, weighted=weighted)
for prob, lab in zip(probas, labels))
else:
loss = lovasz_softmax_flat(*flatten_probas(probas, labels, ignore), classes=classes, weighted=weighted )
return loss
def lovasz_softmax_flat(probas, labels, classes='present', weighted=None):
"""
Multi-class Lovasz-Softmax loss
probas: [P, C] Variable, class probabilities at each prediction (between 0 and 1)
labels: [P] Tensor, ground truth labels (between 0 and C - 1)
classes: 'all' for all, 'present' for classes present in labels, or a list of classes to average.
"""
if probas.numel() == 0:
# only void pixels, the gradients should be 0
return probas * 0.
C = probas.size(1)
losses = []
class_to_sum = list(range(C)) if classes in ['all', 'present'] else classes
for c in class_to_sum:
fg = (labels == c).float() # foreground for class c
if classes == 'present' and fg.sum() == 0:
continue
if C == 1:
if len(classes) > 1:
raise ValueError('Sigmoid output possible only with 1 class')
class_pred = probas[:, 0]
else:
class_pred = probas[:, c]
errors = (Variable(fg) - class_pred).abs()
errors_sorted, perm = torch.sort(errors, 0, descending=True)
perm = perm.data
fg_sorted = fg[perm]
if weighted is not None:
losses.append(weighted[c]*torch.dot(errors_sorted, Variable(lovasz_grad(fg_sorted))))
else:
losses.append(torch.dot(errors_sorted, Variable(lovasz_grad(fg_sorted))))
return mean(losses)
def flatten_probas(probas, labels, ignore=None):
"""
Flattens predictions in the batch
"""
if probas.dim() == 3:
# assumes output of a sigmoid layer
B, H, W = probas.size()
probas = probas.view(B, 1, H, W)
B, C, H, W = probas.size()
probas = probas.permute(0, 2, 3, 1).contiguous().view(-1, C) # B * H * W, C = P, C
labels = labels.view(-1)
if ignore is None:
return probas, labels
valid = (labels != ignore)
vprobas = probas[valid.nonzero().squeeze()]
vlabels = labels[valid]
return vprobas, vlabels
def xloss(logits, labels, ignore=None):
"""
Cross entropy loss
"""
return F.cross_entropy(logits, Variable(labels), ignore_index=255)
# --------------------------- HELPER FUNCTIONS ---------------------------
def isnan(x):
return x != x
def mean(l, ignore_nan=False, empty=0):
"""
nanmean compatible with generators.
"""
l = iter(l)
if ignore_nan:
l = ifilterfalse(isnan, l)
try:
n = 1
acc = next(l)
except StopIteration:
if empty == 'raise':
raise ValueError('Empty mean')
return empty
for n, v in enumerate(l, 2):
acc += v
if n == 1:
return acc
return acc / n
# --------------------------- Class ---------------------------
class LovaszSoftmax(nn.Module):
def __init__(self, per_image=False, ignore_index=255, weighted=None):
super(LovaszSoftmax, self).__init__()
self.lovasz_softmax = lovasz_softmax
self.per_image = per_image
self.ignore_index=ignore_index
self.weighted = weighted
def forward(self, pred, label):
pred = F.softmax(pred, dim=1)
return self.lovasz_softmax(pred, label, per_image=self.per_image, ignore=self.ignore_index, weighted=self.weighted)
import cv2
import os
import numpy as np
from collections import OrderedDict
from PIL import Image as PILImage
from utils.transforms import transform_parsing
LABELS = ['Background', 'Hat', 'Hair', 'Glove', 'Sunglasses', 'Upper-clothes', 'Dress', 'Coat', \
'Socks', 'Pants', 'Jumpsuits', 'Scarf', 'Skirt', 'Face', 'Left-arm', 'Right-arm', 'Left-leg',
'Right-leg', 'Left-shoe', 'Right-shoe']
# LABELS = ['Background', 'Head', 'Torso', 'Upper Arms', 'Lower Arms', 'Upper Legs', 'Lower Legs']
def get_palette(num_cls):
""" Returns the color map for visualizing the segmentation mask.
Args:
num_cls: Number of classes
Returns:
The color map
"""
n = num_cls
palette = [0] * (n * 3)
for j in range(0, n):
lab = j
palette[j * 3 + 0] = 0
palette[j * 3 + 1] = 0
palette[j * 3 + 2] = 0
i = 0
while lab:
palette[j * 3 + 0] |= (((lab >> 0) & 1) << (7 - i))
palette[j * 3 + 1] |= (((lab >> 1) & 1) << (7 - i))
palette[j * 3 + 2] |= (((lab >> 2) & 1) << (7 - i))
i += 1
lab >>= 3
return palette
def get_confusion_matrix(gt_label, pred_label, num_classes):
"""
Calculate the confusion matrix from the given ground truth and prediction.
:param gt_label: the ground truth label
:param pred_label: the predicted label
:param num_classes: the number of classes
:return: the confusion matrix
"""
index = (gt_label * num_classes + pred_label).astype('int32')
label_count = np.bincount(index)
confusion_matrix = np.zeros((num_classes, num_classes))
for i_label in range(num_classes):
for i_pred_label in range(num_classes):
cur_index = i_label * num_classes + i_pred_label
if cur_index < len(label_count):
confusion_matrix[i_label, i_pred_label] = label_count[cur_index]
return confusion_matrix
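# Hedged mini example (values chosen here) of the bincount trick above:
# gt = [0, 0, 1, 1], pred = [0, 1, 1, 1] -> index = gt * 2 + pred = [0, 1, 3, 3],
# bincount = [1, 1, 0, 2], so the 2x2 confusion matrix is [[1, 1], [0, 2]]
# (rows: ground truth, columns: prediction).
cm = get_confusion_matrix(np.array([0, 0, 1, 1]), np.array([0, 1, 1, 1]), num_classes=2)
print(cm)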
def compute_mean_ioU(preds, scales, centers, num_classes, datadir, input_size=[473, 473], dataset='val'):
val_file = os.path.join(datadir, dataset + '_id.txt')
val_id = [i_id.strip() for i_id in open(val_file)]
confusion_matrix = np.zeros((num_classes, num_classes))
for i, pred_out in enumerate(preds):
im_name = val_id[i]
gt_path = os.path.join(datadir, dataset + '_segmentations', im_name + '.png')
gt = np.array(PILImage.open(gt_path))
h, w = gt.shape
s = scales[i]
c = centers[i]
pred = transform_parsing(pred_out, c, s, w, h, input_size)
gt = np.asarray(gt, dtype=np.int32)
pred = np.asarray(pred, dtype=np.int32)
ignore_index = gt != 255
gt = gt[ignore_index]
pred = pred[ignore_index]
confusion_matrix += get_confusion_matrix(gt, pred, num_classes)
pos = confusion_matrix.sum(1)
res = confusion_matrix.sum(0)
tp = np.diag(confusion_matrix)
pixel_accuracy = (tp.sum() / pos.sum()) * 100
mean_accuracy = ((tp / np.maximum(1.0, pos)).mean()) * 100
IoU_array = (tp / np.maximum(1.0, pos + res - tp))
IoU_array = IoU_array * 100
mean_IoU = IoU_array.mean()
print('Pixel accuracy: %f \n' % pixel_accuracy)
print('Mean accuracy: %f \n' % mean_accuracy)
print('Mean IU: %f \n' % mean_IoU)
name_value = []
for i, (label, iou) in enumerate(zip(LABELS, IoU_array)):
name_value.append((label, iou))
name_value.append(('Pixel accuracy', pixel_accuracy))
name_value.append(('Mean accuracy', mean_accuracy))
name_value.append(('Mean IU', mean_IoU))
name_value = OrderedDict(name_value)
return name_value
def compute_mean_ioU_file(preds_dir, num_classes, datadir, dataset='val'):
list_path = os.path.join(datadir, dataset + '_id.txt')
val_id = [i_id.strip() for i_id in open(list_path)]
confusion_matrix = np.zeros((num_classes, num_classes))
for i, im_name in enumerate(val_id):
gt_path = os.path.join(datadir, 'segmentations', im_name + '.png')
gt = cv2.imread(gt_path, cv2.IMREAD_GRAYSCALE)
pred_path = os.path.join(preds_dir, im_name + '.png')
pred = np.asarray(PILImage.open(pred_path))
gt = np.asarray(gt, dtype=np.int32)
pred = np.asarray(pred, dtype=np.int32)
ignore_index = gt != 255
gt = gt[ignore_index]
pred = pred[ignore_index]
confusion_matrix += get_confusion_matrix(gt, pred, num_classes)
pos = confusion_matrix.sum(1)
res = confusion_matrix.sum(0)
tp = np.diag(confusion_matrix)
pixel_accuracy = (tp.sum() / pos.sum()) * 100
mean_accuracy = ((tp / np.maximum(1.0, pos)).mean()) * 100
IoU_array = (tp / np.maximum(1.0, pos + res - tp))
IoU_array = IoU_array * 100
mean_IoU = IoU_array.mean()
print('Pixel accuracy: %f \n' % pixel_accuracy)
print('Mean accuracy: %f \n' % mean_accuracy)
print('Mean IU: %f \n' % mean_IoU)
name_value = []
for i, (label, iou) in enumerate(zip(LABELS, IoU_array)):
name_value.append((label, iou))
name_value.append(('Pixel accuracy', pixel_accuracy))
name_value.append(('Mean accuracy', mean_accuracy))
name_value.append(('Mean IU', mean_IoU))
name_value = OrderedDict(name_value)
return name_value
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
@Author : Peike Li
@Contact : peike.li@yahoo.com
@File : schp.py
@Time : 4/8/19 2:11 PM
@Desc :
@License : This source code is licensed under the license found in the
LICENSE file in the root directory of this source tree.
"""
import os
import torch
import modules
def moving_average(net1, net2, alpha=1):
for param1, param2 in zip(net1.parameters(), net2.parameters()):
param1.data *= (1.0 - alpha)
param1.data += param2.data * alpha
def _check_bn(module, flag):
if issubclass(module.__class__, modules.bn.InPlaceABNSync):
flag[0] = True
def check_bn(model):
flag = [False]
model.apply(lambda module: _check_bn(module, flag))
return flag[0]
def reset_bn(module):
if issubclass(module.__class__, modules.bn.InPlaceABNSync):
module.running_mean = torch.zeros_like(module.running_mean)
module.running_var = torch.ones_like(module.running_var)
def _get_momenta(module, momenta):
if issubclass(module.__class__, modules.bn.InPlaceABNSync):
momenta[module] = module.momentum
def _set_momenta(module, momenta):
if issubclass(module.__class__, modules.bn.InPlaceABNSync):
module.momentum = momenta[module]
def bn_re_estimate(loader, model):
if not check_bn(model):
print('No batch norm layer detected')
return
model.train()
momenta = {}
model.apply(reset_bn)
model.apply(lambda module: _get_momenta(module, momenta))
n = 0
for i_iter, batch in enumerate(loader):
images, labels, _ = batch
b = images.data.size(0)
momentum = b / (n + b)
for module in momenta.keys():
module.momentum = momentum
model(images)
n += b
model.apply(lambda module: _set_momenta(module, momenta))
def save_schp_checkpoint(states, is_best_parsing, output_dir, filename='schp_checkpoint.pth.tar'):
save_path = os.path.join(output_dir, filename)
if os.path.exists(save_path):
os.remove(save_path)
torch.save(states, save_path)
if is_best_parsing and 'state_dict' in states:
best_save_path = os.path.join(output_dir, 'model_parsing_best.pth.tar')
if os.path.exists(best_save_path):
os.remove(best_save_path)
torch.save(states, best_save_path)
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
@Author : Peike Li
@Contact : peike.li@yahoo.com
@File : soft_dice_loss.py
@Time : 8/13/19 5:09 PM
@Desc :
@License : This source code is licensed under the license found in the
LICENSE file in the root directory of this source tree.
"""
from __future__ import print_function, division
import torch
import torch.nn.functional as F
from torch import nn
try:
from itertools import ifilterfalse
except ImportError: # py3k
from itertools import filterfalse as ifilterfalse
def tversky_loss(probas, labels, alpha=0.5, beta=0.5, epsilon=1e-6):
'''
Tversky loss function.
probas: [P, C] Variable, class probabilities at each prediction (between 0 and 1)
labels: [P] Tensor, ground truth labels (between 0 and C - 1)
Same as soft Dice loss when alpha=beta=0.5.
Same as Jaccard loss when alpha=beta=1.0.
See `Tversky loss function for image segmentation using 3D fully convolutional deep networks`
https://arxiv.org/pdf/1706.05721.pdf
'''
C = probas.size(1)
losses = []
for c in list(range(C)):
fg = (labels == c).float()
if fg.sum() == 0:
continue
class_pred = probas[:, c]
p0 = class_pred
p1 = 1 - class_pred
g0 = fg
g1 = 1 - fg
numerator = torch.sum(p0 * g0)
denominator = numerator + alpha * torch.sum(p0 * g1) + beta * torch.sum(p1 * g0)
losses.append(1 - ((numerator) / (denominator + epsilon)))
return mean(losses)
def flatten_probas(probas, labels, ignore=255):
"""
Flattens predictions in the batch
"""
B, C, H, W = probas.size()
probas = probas.permute(0, 2, 3, 1).contiguous().view(-1, C) # B * H * W, C = P, C
labels = labels.view(-1)
if ignore is None:
return probas, labels
valid = (labels != ignore)
vprobas = probas[valid.nonzero().squeeze()]
vlabels = labels[valid]
return vprobas, vlabels
def isnan(x):
return x != x
def mean(l, ignore_nan=False, empty=0):
"""
nanmean compatible with generators.
"""
l = iter(l)
if ignore_nan:
l = ifilterfalse(isnan, l)
try:
n = 1
acc = next(l)
except StopIteration:
if empty == 'raise':
raise ValueError('Empty mean')
return empty
for n, v in enumerate(l, 2):
acc += v
if n == 1:
return acc
return acc / n
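# Hedged numeric sketch (values chosen here) showing the Dice special case
# (alpha = beta = 0.5) on already-flattened [P, C] probabilities; assumes torch
# and tversky_loss above are in scope.
example_probas = torch.tensor([[0.8, 0.2], [0.3, 0.7]])
example_labels = torch.tensor([0, 1])
print(tversky_loss(example_probas, example_labels, alpha=0.5, beta=0.5))  # ~0.25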
class SoftDiceLoss(nn.Module):
def __init__(self, ignore_index=255):
super(SoftDiceLoss, self).__init__()
self.ignore_index = ignore_index
def forward(self, pred, label):
pred = F.softmax(pred, dim=1)
return tversky_loss(*flatten_probas(pred, label, ignore=self.ignore_index), alpha=0.5, beta=0.5)
class SoftJaccordLoss(nn.Module):
def __init__(self, ignore_index=255):
super(SoftJaccordLoss, self).__init__()
self.ignore_index = ignore_index
def forward(self, pred, label):
pred = F.softmax(pred, dim=1)
return tversky_loss(*flatten_probas(pred, label, ignore=self.ignore_index), alpha=1.0, beta=1.0)
# ------------------------------------------------------------------------------
# Copyright (c) Microsoft
# Licensed under the MIT License.
# Written by Bin Xiao (Bin.Xiao@microsoft.com)
# ------------------------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import cv2
import torch
class BRG2Tensor_transform(object):
def __call__(self, pic):
img = torch.from_numpy(pic.transpose((2, 0, 1)))
if isinstance(img, torch.ByteTensor):
return img.float()
else:
return img
class BGR2RGB_transform(object):
def __call__(self, tensor):
return tensor[[2,1,0],:,:]
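# Hedged one-line check (values chosen here): the index swap above reorders a
# [3, H, W] tensor from BGR to RGB channel order.
bgr = torch.tensor([[[1.0]], [[2.0]], [[3.0]]])   # channels B, G, R
rgb = BGR2RGB_transform()(bgr)                    # channels R, G, B
print(rgb[0, 0, 0].item(), rgb[2, 0, 0].item())   # 3.0 1.0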
def flip_back(output_flipped, matched_parts):
'''
output_flipped: numpy.ndarray(batch_size, num_joints, height, width)
'''
assert output_flipped.ndim == 4,\
'output_flipped should be [batch_size, num_joints, height, width]'
output_flipped = output_flipped[:, :, :, ::-1]
for pair in matched_parts:
tmp = output_flipped[:, pair[0], :, :].copy()
output_flipped[:, pair[0], :, :] = output_flipped[:, pair[1], :, :]
output_flipped[:, pair[1], :, :] = tmp
return output_flipped
def fliplr_joints(joints, joints_vis, width, matched_parts):
"""
flip coords
"""
# Flip horizontal
joints[:, 0] = width - joints[:, 0] - 1
# Change left-right parts
for pair in matched_parts:
joints[pair[0], :], joints[pair[1], :] = \
joints[pair[1], :], joints[pair[0], :].copy()
joints_vis[pair[0], :], joints_vis[pair[1], :] = \
joints_vis[pair[1], :], joints_vis[pair[0], :].copy()
return joints*joints_vis, joints_vis
def transform_preds(coords, center, scale, input_size):
target_coords = np.zeros(coords.shape)
trans = get_affine_transform(center, scale, 0, input_size, inv=1)
for p in range(coords.shape[0]):
target_coords[p, 0:2] = affine_transform(coords[p, 0:2], trans)
return target_coords
def transform_parsing(pred, center, scale, width, height, input_size):
trans = get_affine_transform(center, scale, 0, input_size, inv=1)
target_pred = cv2.warpAffine(
pred,
trans,
(int(width), int(height)),
flags=cv2.INTER_NEAREST,
borderMode=cv2.BORDER_CONSTANT,
borderValue=(0))
return target_pred
def transform_logits(logits, center, scale, width, height, input_size):
trans = get_affine_transform(center, scale, 0, input_size, inv=1)
channel = logits.shape[2]
target_logits = []
for i in range(channel):
target_logit = cv2.warpAffine(
logits[:,:,i],
trans,
(int(width), int(height)),
flags=cv2.INTER_LINEAR,
borderMode=cv2.BORDER_CONSTANT,
borderValue=(0))
target_logits.append(target_logit)
target_logits = np.stack(target_logits,axis=2)
return target_logits
def get_affine_transform(center,
scale,
rot,
output_size,
shift=np.array([0, 0], dtype=np.float32),
inv=0):
if not isinstance(scale, np.ndarray) and not isinstance(scale, list):
print(scale)
scale = np.array([scale, scale])
scale_tmp = scale
src_w = scale_tmp[0]
dst_w = output_size[1]
dst_h = output_size[0]
rot_rad = np.pi * rot / 180
src_dir = get_dir([0, src_w * -0.5], rot_rad)
dst_dir = np.array([0, (dst_w-1) * -0.5], np.float32)
src = np.zeros((3, 2), dtype=np.float32)
dst = np.zeros((3, 2), dtype=np.float32)
src[0, :] = center + scale_tmp * shift
src[1, :] = center + src_dir + scale_tmp * shift
dst[0, :] = [(dst_w-1) * 0.5, (dst_h-1) * 0.5]
dst[1, :] = np.array([(dst_w-1) * 0.5, (dst_h-1) * 0.5]) + dst_dir
src[2:, :] = get_3rd_point(src[0, :], src[1, :])
dst[2:, :] = get_3rd_point(dst[0, :], dst[1, :])
if inv:
trans = cv2.getAffineTransform(np.float32(dst), np.float32(src))
else:
trans = cv2.getAffineTransform(np.float32(src), np.float32(dst))
return trans
def affine_transform(pt, t):
new_pt = np.array([pt[0], pt[1], 1.]).T
new_pt = np.dot(t, new_pt)
return new_pt[:2]
def get_3rd_point(a, b):
direct = a - b
return b + np.array([-direct[1], direct[0]], dtype=np.float32)
def get_dir(src_point, rot_rad):
sn, cs = np.sin(rot_rad), np.cos(rot_rad)
src_result = [0, 0]
src_result[0] = src_point[0] * cs - src_point[1] * sn
src_result[1] = src_point[0] * sn + src_point[1] * cs
return src_result
def crop(img, center, scale, output_size, rot=0):
trans = get_affine_transform(center, scale, rot, output_size)
dst_img = cv2.warpAffine(img,
trans,
(int(output_size[1]), int(output_size[0])),
flags=cv2.INTER_LINEAR)
return dst_img
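# Hedged sanity sketch (values chosen here): the forward matrix maps the person box
# onto the 473x473 network input, and inv=1 produces the matrix used to warp results back.
example_center = np.array([199.5, 299.5], dtype=np.float32)
example_scale = np.array([599.0, 599.0], dtype=np.float32)
fwd = get_affine_transform(example_center, example_scale, 0, np.array([473, 473]))
inv = get_affine_transform(example_center, example_scale, 0, np.array([473, 473]), inv=1)
print(fwd.shape, inv.shape)  # (2, 3) (2, 3)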
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
@Author : Peike Li
@Contact : peike.li@yahoo.com
@File : warmup_scheduler.py
@Time : 3/28/19 2:24 PM
@Desc :
@License : This source code is licensed under the license found in the
LICENSE file in the root directory of this source tree.
"""
import math
from torch.optim.lr_scheduler import _LRScheduler
class GradualWarmupScheduler(_LRScheduler):
""" Gradually warm-up learning rate with cosine annealing in optimizer.
Proposed in 'Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour'.
"""
def __init__(self, optimizer, total_epoch, eta_min=0, warmup_epoch=10, last_epoch=-1):
self.total_epoch = total_epoch
self.eta_min = eta_min
self.warmup_epoch = warmup_epoch
super(GradualWarmupScheduler, self).__init__(optimizer, last_epoch)
def get_lr(self):
if self.last_epoch <= self.warmup_epoch:
return [self.eta_min + self.last_epoch*(base_lr - self.eta_min)/self.warmup_epoch for base_lr in self.base_lrs]
else:
return [self.eta_min + (base_lr-self.eta_min)*(1+math.cos(math.pi*(self.last_epoch-self.warmup_epoch)/(self.total_epoch-self.warmup_epoch))) / 2 for base_lr in self.base_lrs]
class SGDRScheduler(_LRScheduler):
""" Consine annealing with warm up and restarts.
Proposed in `SGDR: Stochastic Gradient Descent with Warm Restarts`.
"""
def __init__(self, optimizer, total_epoch=150, start_cyclical=100, cyclical_base_lr=7e-4, cyclical_epoch=10, eta_min=0, warmup_epoch=10, last_epoch=-1):
self.total_epoch = total_epoch
self.start_cyclical = start_cyclical
self.cyclical_epoch = cyclical_epoch
self.cyclical_base_lr = cyclical_base_lr
self.eta_min = eta_min
self.warmup_epoch = warmup_epoch
super(SGDRScheduler, self).__init__(optimizer, last_epoch)
def get_lr(self):
if self.last_epoch < self.warmup_epoch:
return [self.eta_min + self.last_epoch*(base_lr - self.eta_min)/self.warmup_epoch for base_lr in self.base_lrs]
elif self.last_epoch < self.start_cyclical:
return [self.eta_min + (base_lr-self.eta_min)*(1+math.cos(math.pi*(self.last_epoch-self.warmup_epoch)/(self.start_cyclical-self.warmup_epoch))) / 2 for base_lr in self.base_lrs]
else:
return [self.eta_min + (self.cyclical_base_lr-self.eta_min)*(1+math.cos(math.pi* ((self.last_epoch-self.start_cyclical)% self.cyclical_epoch)/self.cyclical_epoch)) / 2 for base_lr in self.base_lrs]
if __name__ == '__main__':
import matplotlib.pyplot as plt
import torch
model = torch.nn.Linear(10, 2)
optimizer = torch.optim.SGD(params=model.parameters(), lr=7e-3, momentum=0.9, weight_decay=5e-4)
scheduler_warmup = SGDRScheduler(optimizer, total_epoch=150, eta_min=7e-5, warmup_epoch=10, start_cyclical=100, cyclical_base_lr=3.5e-3, cyclical_epoch=10)
lr = []
for epoch in range(0,150):
scheduler_warmup.step(epoch)
lr.append(scheduler_warmup.get_lr())
plt.style.use('ggplot')
plt.plot(list(range(0,150)), lr)
plt.show()
from pathlib import Path
import numpy as np
import torch
import clip
......@@ -6,6 +8,10 @@ from PIL import Image
from io import BytesIO
import logging
from app.models.schp.mask import extract_color_region_simple
from app.models.schp.simple_extractor import HumanParsingModel, process_single_image, get_color_by_label
class FeatureExtractor:
__logger = logging.getLogger(__name__)
......@@ -14,6 +20,8 @@ class FeatureExtractor:
device = "cpu"
self.model, self.preprocess = self.init_model(device, model_name)
self.device = device
model_path = Path(__file__).parent.absolute() / "../models/schp/checkpoints/exp-schp-201908261155-lip.pth"
self.schp_model = HumanParsingModel(model_path, dataset='lip')
@staticmethod
def init_model(device="xpu" if torch.xpu.is_available() else "cpu", model_name="ViT-B/32"):
......@@ -52,7 +60,7 @@ class FeatureExtractor:
return new_img
def extract_from_url(self, image_url):
def extract_from_url(self, image_url, part):
"""
Load an image from a URL and extract its feature vector.
......@@ -71,7 +79,7 @@ class FeatureExtractor:
# 将图片数据转换为 PIL Image 对象
image = Image.open(BytesIO(response.content)).convert("RGB")
return self.extract_from_image(image)
return self.extract_from_image(image, part)
except requests.RequestException as e:
self.__logger.error(f"Network error when downloading image from {image_url}: {e}")
......@@ -80,7 +88,7 @@ class FeatureExtractor:
self.__logger.error(f"Error extracting features from URL {image_url}: {e}")
return None
def extract_from_image(self, img):
def extract_from_image(self, img, part):
"""
Extract a feature vector from a PIL Image object.
......@@ -100,6 +108,12 @@ class FeatureExtractor:
# model_name = "ViT-L/14@336px"
# model, preprocess = self.init_model(device, model_name)
color = get_color_by_label(part)
if color:
mask_img = process_single_image(self.schp_model, img)
img = extract_color_region_simple(img, mask_img, color)
try:
# Resize the image and add padding
image = self.resize_with_padding(img)
......
......@@ -7,10 +7,10 @@ class ImageSearch:
self.feature_extractor = feature_extractor
self.milvus = milvus
def image_to_image_search(self, bucket, image, top_k = 100):
def image_to_image_search(self, bucket, image, part, top_k = 100):
try:
# Extract features of the query image
vector = self.feature_extractor.extract_from_url(image)
vector = self.feature_extractor.extract_from_url(image, part)
results = self.milvus.search(bucket, vector, top_k)
......
import unittest
from typing import List
from pymilvus import FieldSchema, DataType
from app.models.schp.simple_extractor import get_color_by_label
from app.services.milvus import MilvusClient
class TestGetColorByLabel(unittest.TestCase):
def test_get_color_by_label(self):
color = get_color_by_label("Jumpsuits")
print(color)
self.assertEqual(len(color), 3)
self.assertEqual(get_color_by_label("NotALabel"), [])
if __name__ == '__main__':
unittest.main()