diff --git a/Makefile b/Makefile index b0b5cbe..e4ce240 100644 --- a/Makefile +++ b/Makefile @@ -222,6 +222,18 @@ vendorize_refiners: find ./imaginairy/vendored/refiners/ -type f -name "*.py" -exec sed -i '' 's/import refiners/import imaginairy.vendored.refiners/g' {} + &&\ make af +vendorize_facexlib: + export REPO=git@github.com:xinntao/facexlib.git PKG=facexlib COMMIT=260620ae93990a300f4b16448df9bb459f1caba9 && \ + make download_repo REPO=$$REPO PKG=$$PKG COMMIT=$$COMMIT && \ + mkdir -p ./imaginairy/vendored/$$PKG && \ + rm -rf ./imaginairy/vendored/$$PKG/* && \ + cp -R ./downloads/$$PKG/facexlib/* ./imaginairy/vendored/$$PKG/ && \ + rm -rf ./imaginairy/vendored/$$PKG/weights && \ + cp ./downloads/$$PKG/LICENSE ./imaginairy/vendored/$$PKG/ && \ + echo "vendored from $$REPO @ $$COMMIT" | tee ./imaginairy/vendored/$$PKG/readme.txt + find ./imaginairy/vendored/facexlib/ -type f -name "*.py" -exec sed -i '' 's/from facexlib/from imaginairy.vendored.facexlib/g' {} + &&\ + sed -i '' '/from \.version import __gitsha__, __version__/d' ./imaginairy/vendored/facexlib/__init__.py + make af vendorize: ## vendorize a github repo. `make vendorize REPO=git@github.com:openai/CLIP.git PKG=clip` mkdir -p ./downloads diff --git a/imaginairy/enhancers/face_restoration_codeformer.py b/imaginairy/enhancers/face_restoration_codeformer.py index 010f7f2..fea0520 100644 --- a/imaginairy/enhancers/face_restoration_codeformer.py +++ b/imaginairy/enhancers/face_restoration_codeformer.py @@ -5,20 +5,62 @@ from functools import lru_cache import numpy as np import torch -from facexlib.utils.face_restoration_helper import FaceRestoreHelper from PIL import Image from torchvision.transforms.functional import normalize from imaginairy.utils.model_manager import get_cached_url_path from imaginairy.vendored.basicsr.img_util import img2tensor, tensor2img from imaginairy.vendored.codeformer.codeformer_arch import CodeFormer +from imaginairy.vendored.facexlib.utils.face_restoration_helper import FaceRestoreHelper logger = logging.getLogger(__name__) + face_restore_device = torch.device("cuda" if torch.cuda.is_available() else "cpu") half_mode = face_restore_device == "cuda" +def load_file_from_url( + url, model_dir=None, progress=True, file_name=None, save_dir=None +): + return get_cached_url_path(url, category="facexlib") + + +@lru_cache(maxsize=1) +def patch_download_function_in_facexlib_modules(): + """Replaces the custom weights downloaded with the standard imaginairy one.""" + import imaginairy.vendored.facexlib.utils.misc + from imaginairy.vendored.facexlib import ( + alignment, + assessment, + detection, + headpose, + matting, + parsing, + recognition, + tracking, + visualization, + ) + + modules = [ + alignment, + assessment, + detection, + headpose, + matting, + parsing, + recognition, + tracking, + visualization, + imaginairy.vendored.facexlib.utils.misc, + ] + for m in modules: + m.load_file_from_url = load_file_from_url + + +patch_download_function_in_facexlib_modules() + + @lru_cache def codeformer_model(): model = CodeFormer( diff --git a/imaginairy/img_processors/openpose.py b/imaginairy/img_processors/openpose.py index 897611f..19febad 100644 --- a/imaginairy/img_processors/openpose.py +++ b/imaginairy/img_processors/openpose.py @@ -5,7 +5,6 @@ from collections import OrderedDict from functools import lru_cache import cv2 -import matplotlib as mpl import numpy as np import torch from scipy.ndimage.filters import gaussian_filter @@ -125,6 +124,8 @@ def draw_bodypose(canvas, candidate, subset): # image drawed by 
opencv is not good. def draw_handpose(canvas, all_hand_peaks, show_number=False): + import matplotlib as mpl + edges = [ [0, 1], [1, 2], diff --git a/imaginairy/modules/sgm/autoencoding/losses/discriminator_loss.py b/imaginairy/modules/sgm/autoencoding/losses/discriminator_loss.py index b14f5f9..130c4ad 100644 --- a/imaginairy/modules/sgm/autoencoding/losses/discriminator_loss.py +++ b/imaginairy/modules/sgm/autoencoding/losses/discriminator_loss.py @@ -7,7 +7,6 @@ import torch import torch.nn as nn import torchvision from einops import rearrange -from matplotlib import colormaps, pyplot as plt from imaginairy.modules.sgm.autoencoding.lpips.loss.lpips import LPIPS from imaginairy.modules.sgm.autoencoding.lpips.model.model import weights_init @@ -98,6 +97,8 @@ class GeneralLPIPSWithDiscriminator(nn.Module): def log_images( self, inputs: torch.Tensor, reconstructions: torch.Tensor ) -> Dict[str, torch.Tensor]: + from matplotlib import colormaps, pyplot as plt + # calc logits of real/fake logits_real = self.discriminator(inputs.contiguous().detach()) if len(logits_real.shape) < 4: diff --git a/imaginairy/vendored/facexlib/LICENSE b/imaginairy/vendored/facexlib/LICENSE new file mode 100644 index 0000000..f4cf6a7 --- /dev/null +++ b/imaginairy/vendored/facexlib/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2020 Xintao Wang + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/imaginairy/vendored/facexlib/__init__.py b/imaginairy/vendored/facexlib/__init__.py new file mode 100644 index 0000000..39181bf --- /dev/null +++ b/imaginairy/vendored/facexlib/__init__.py @@ -0,0 +1,7 @@ +# flake8: noqa +from .alignment import * +from .detection import * +from .recognition import * +from .tracking import * +from .utils import * +from .visualization import * diff --git a/imaginairy/vendored/facexlib/alignment/README.md b/imaginairy/vendored/facexlib/alignment/README.md new file mode 100644 index 0000000..754c03e --- /dev/null +++ b/imaginairy/vendored/facexlib/alignment/README.md @@ -0,0 +1,20 @@ + +## Landmarks + +- 5 landmarks + +
+
+- 68 landmarks
+
+- 98 landmarks
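The README above only lists the landmark layouts; the alignment code that follows (`alignment/__init__.py`, `awing_arch.py`, `convert_98_to_68_landmarks.py`) is what produces them. Below is a minimal usage sketch, not part of the patch itself: it assumes a local `face.jpg`, that the AWing-FAN weights can be downloaded (or are already cached) by `load_file_from_url`, that the checkpoint loads on the chosen device, and a NumPy version that still provides `np.float` (the vendored `calculate_points` uses it).

```python
import cv2
import torch

from imaginairy.vendored.facexlib.alignment import (
    init_alignment_model,
    landmark_98_to_68,
)

# Build the 98-point AWing FAN landmark model (weights are fetched on first use).
device = "cuda" if torch.cuda.is_available() else "cpu"
model = init_alignment_model("awing_fan", device=device)

# get_landmarks expects an HxWx3 BGR image (OpenCV convention) and returns
# 98 (x, y) points scaled back to the original image size.
img = cv2.imread("face.jpg")  # hypothetical input image
landmarks_98 = model.get_landmarks(img)         # shape (98, 2)

# Collapse to the common 68-point layout when downstream code expects it.
landmarks_68 = landmark_98_to_68(landmarks_98)  # shape (68, 2)
print(landmarks_68.shape)
```

The 98-to-68 conversion is a pure index remap (with some eyebrow points averaged), so no additional model is required for it.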
diff --git a/imaginairy/vendored/facexlib/alignment/__init__.py b/imaginairy/vendored/facexlib/alignment/__init__.py new file mode 100644 index 0000000..129a456 --- /dev/null +++ b/imaginairy/vendored/facexlib/alignment/__init__.py @@ -0,0 +1,22 @@ +import torch + +from imaginairy.vendored.facexlib.utils import load_file_from_url +from .awing_arch import FAN +from .convert_98_to_68_landmarks import landmark_98_to_68 + +__all__ = ['FAN', 'landmark_98_to_68'] + + +def init_alignment_model(model_name, half=False, device='cuda', model_rootpath=None): + if model_name == 'awing_fan': + model = FAN(num_modules=4, num_landmarks=98, device=device) + model_url = 'https://github.com/xinntao/facexlib/releases/download/v0.1.0/alignment_WFLW_4HG.pth' + else: + raise NotImplementedError(f'{model_name} is not implemented.') + + model_path = load_file_from_url( + url=model_url, model_dir='facexlib/weights', progress=True, file_name=None, save_dir=model_rootpath) + model.load_state_dict(torch.load(model_path)['state_dict'], strict=True) + model.eval() + model = model.to(device) + return model diff --git a/imaginairy/vendored/facexlib/alignment/awing_arch.py b/imaginairy/vendored/facexlib/alignment/awing_arch.py new file mode 100644 index 0000000..cd56561 --- /dev/null +++ b/imaginairy/vendored/facexlib/alignment/awing_arch.py @@ -0,0 +1,378 @@ +import cv2 +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + + +def calculate_points(heatmaps): + # change heatmaps to landmarks + B, N, H, W = heatmaps.shape + HW = H * W + BN_range = np.arange(B * N) + + heatline = heatmaps.reshape(B, N, HW) + indexes = np.argmax(heatline, axis=2) + + preds = np.stack((indexes % W, indexes // W), axis=2) + preds = preds.astype(np.float, copy=False) + + inr = indexes.ravel() + + heatline = heatline.reshape(B * N, HW) + x_up = heatline[BN_range, inr + 1] + x_down = heatline[BN_range, inr - 1] + # y_up = heatline[BN_range, inr + W] + + if any((inr + W) >= 4096): + y_up = heatline[BN_range, 4095] + else: + y_up = heatline[BN_range, inr + W] + if any((inr - W) <= 0): + y_down = heatline[BN_range, 0] + else: + y_down = heatline[BN_range, inr - W] + + think_diff = np.sign(np.stack((x_up - x_down, y_up - y_down), axis=1)) + think_diff *= .25 + + preds += think_diff.reshape(B, N, 2) + preds += .5 + return preds + + +class AddCoordsTh(nn.Module): + + def __init__(self, x_dim=64, y_dim=64, with_r=False, with_boundary=False): + super(AddCoordsTh, self).__init__() + self.x_dim = x_dim + self.y_dim = y_dim + self.with_r = with_r + self.with_boundary = with_boundary + + def forward(self, input_tensor, heatmap=None): + """ + input_tensor: (batch, c, x_dim, y_dim) + """ + batch_size_tensor = input_tensor.shape[0] + + xx_ones = torch.ones([1, self.y_dim], dtype=torch.int32, device=input_tensor.device) + xx_ones = xx_ones.unsqueeze(-1) + + xx_range = torch.arange(self.x_dim, dtype=torch.int32, device=input_tensor.device).unsqueeze(0) + xx_range = xx_range.unsqueeze(1) + + xx_channel = torch.matmul(xx_ones.float(), xx_range.float()) + xx_channel = xx_channel.unsqueeze(-1) + + yy_ones = torch.ones([1, self.x_dim], dtype=torch.int32, device=input_tensor.device) + yy_ones = yy_ones.unsqueeze(1) + + yy_range = torch.arange(self.y_dim, dtype=torch.int32, device=input_tensor.device).unsqueeze(0) + yy_range = yy_range.unsqueeze(-1) + + yy_channel = torch.matmul(yy_range.float(), yy_ones.float()) + yy_channel = yy_channel.unsqueeze(-1) + + xx_channel = xx_channel.permute(0, 3, 2, 1) + yy_channel = 
yy_channel.permute(0, 3, 2, 1) + + xx_channel = xx_channel / (self.x_dim - 1) + yy_channel = yy_channel / (self.y_dim - 1) + + xx_channel = xx_channel * 2 - 1 + yy_channel = yy_channel * 2 - 1 + + xx_channel = xx_channel.repeat(batch_size_tensor, 1, 1, 1) + yy_channel = yy_channel.repeat(batch_size_tensor, 1, 1, 1) + + if self.with_boundary and heatmap is not None: + boundary_channel = torch.clamp(heatmap[:, -1:, :, :], 0.0, 1.0) + + zero_tensor = torch.zeros_like(xx_channel) + xx_boundary_channel = torch.where(boundary_channel > 0.05, xx_channel, zero_tensor) + yy_boundary_channel = torch.where(boundary_channel > 0.05, yy_channel, zero_tensor) + if self.with_boundary and heatmap is not None: + xx_boundary_channel = xx_boundary_channel.to(input_tensor.device) + yy_boundary_channel = yy_boundary_channel.to(input_tensor.device) + ret = torch.cat([input_tensor, xx_channel, yy_channel], dim=1) + + if self.with_r: + rr = torch.sqrt(torch.pow(xx_channel, 2) + torch.pow(yy_channel, 2)) + rr = rr / torch.max(rr) + ret = torch.cat([ret, rr], dim=1) + + if self.with_boundary and heatmap is not None: + ret = torch.cat([ret, xx_boundary_channel, yy_boundary_channel], dim=1) + return ret + + +class CoordConvTh(nn.Module): + """CoordConv layer as in the paper.""" + + def __init__(self, x_dim, y_dim, with_r, with_boundary, in_channels, first_one=False, *args, **kwargs): + super(CoordConvTh, self).__init__() + self.addcoords = AddCoordsTh(x_dim=x_dim, y_dim=y_dim, with_r=with_r, with_boundary=with_boundary) + in_channels += 2 + if with_r: + in_channels += 1 + if with_boundary and not first_one: + in_channels += 2 + self.conv = nn.Conv2d(in_channels=in_channels, *args, **kwargs) + + def forward(self, input_tensor, heatmap=None): + ret = self.addcoords(input_tensor, heatmap) + last_channel = ret[:, -2:, :, :] + ret = self.conv(ret) + return ret, last_channel + + +def conv3x3(in_planes, out_planes, strd=1, padding=1, bias=False, dilation=1): + '3x3 convolution with padding' + return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=strd, padding=padding, bias=bias, dilation=dilation) + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(BasicBlock, self).__init__() + self.conv1 = conv3x3(inplanes, planes, stride) + # self.bn1 = nn.BatchNorm2d(planes) + self.relu = nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes, planes) + # self.bn2 = nn.BatchNorm2d(planes) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.relu(out) + + out = self.conv2(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class ConvBlock(nn.Module): + + def __init__(self, in_planes, out_planes): + super(ConvBlock, self).__init__() + self.bn1 = nn.BatchNorm2d(in_planes) + self.conv1 = conv3x3(in_planes, int(out_planes / 2)) + self.bn2 = nn.BatchNorm2d(int(out_planes / 2)) + self.conv2 = conv3x3(int(out_planes / 2), int(out_planes / 4), padding=1, dilation=1) + self.bn3 = nn.BatchNorm2d(int(out_planes / 4)) + self.conv3 = conv3x3(int(out_planes / 4), int(out_planes / 4), padding=1, dilation=1) + + if in_planes != out_planes: + self.downsample = nn.Sequential( + nn.BatchNorm2d(in_planes), + nn.ReLU(True), + nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, bias=False), + ) + else: + self.downsample = None + + def forward(self, x): + residual = x + + out1 = self.bn1(x) + out1 = F.relu(out1, 
True) + out1 = self.conv1(out1) + + out2 = self.bn2(out1) + out2 = F.relu(out2, True) + out2 = self.conv2(out2) + + out3 = self.bn3(out2) + out3 = F.relu(out3, True) + out3 = self.conv3(out3) + + out3 = torch.cat((out1, out2, out3), 1) + + if self.downsample is not None: + residual = self.downsample(residual) + + out3 += residual + + return out3 + + +class HourGlass(nn.Module): + + def __init__(self, num_modules, depth, num_features, first_one=False): + super(HourGlass, self).__init__() + self.num_modules = num_modules + self.depth = depth + self.features = num_features + self.coordconv = CoordConvTh( + x_dim=64, + y_dim=64, + with_r=True, + with_boundary=True, + in_channels=256, + first_one=first_one, + out_channels=256, + kernel_size=1, + stride=1, + padding=0) + self._generate_network(self.depth) + + def _generate_network(self, level): + self.add_module('b1_' + str(level), ConvBlock(256, 256)) + + self.add_module('b2_' + str(level), ConvBlock(256, 256)) + + if level > 1: + self._generate_network(level - 1) + else: + self.add_module('b2_plus_' + str(level), ConvBlock(256, 256)) + + self.add_module('b3_' + str(level), ConvBlock(256, 256)) + + def _forward(self, level, inp): + # Upper branch + up1 = inp + up1 = self._modules['b1_' + str(level)](up1) + + # Lower branch + low1 = F.avg_pool2d(inp, 2, stride=2) + low1 = self._modules['b2_' + str(level)](low1) + + if level > 1: + low2 = self._forward(level - 1, low1) + else: + low2 = low1 + low2 = self._modules['b2_plus_' + str(level)](low2) + + low3 = low2 + low3 = self._modules['b3_' + str(level)](low3) + + up2 = F.interpolate(low3, scale_factor=2, mode='nearest') + + return up1 + up2 + + def forward(self, x, heatmap): + x, last_channel = self.coordconv(x, heatmap) + return self._forward(self.depth, x), last_channel + + +class FAN(nn.Module): + + def __init__(self, num_modules=1, end_relu=False, gray_scale=False, num_landmarks=68, device='cuda'): + super(FAN, self).__init__() + self.device = device + self.num_modules = num_modules + self.gray_scale = gray_scale + self.end_relu = end_relu + self.num_landmarks = num_landmarks + + # Base part + if self.gray_scale: + self.conv1 = CoordConvTh( + x_dim=256, + y_dim=256, + with_r=True, + with_boundary=False, + in_channels=3, + out_channels=64, + kernel_size=7, + stride=2, + padding=3) + else: + self.conv1 = CoordConvTh( + x_dim=256, + y_dim=256, + with_r=True, + with_boundary=False, + in_channels=3, + out_channels=64, + kernel_size=7, + stride=2, + padding=3) + self.bn1 = nn.BatchNorm2d(64) + self.conv2 = ConvBlock(64, 128) + self.conv3 = ConvBlock(128, 128) + self.conv4 = ConvBlock(128, 256) + + # Stacking part + for hg_module in range(self.num_modules): + if hg_module == 0: + first_one = True + else: + first_one = False + self.add_module('m' + str(hg_module), HourGlass(1, 4, 256, first_one)) + self.add_module('top_m_' + str(hg_module), ConvBlock(256, 256)) + self.add_module('conv_last' + str(hg_module), nn.Conv2d(256, 256, kernel_size=1, stride=1, padding=0)) + self.add_module('bn_end' + str(hg_module), nn.BatchNorm2d(256)) + self.add_module('l' + str(hg_module), nn.Conv2d(256, num_landmarks + 1, kernel_size=1, stride=1, padding=0)) + + if hg_module < self.num_modules - 1: + self.add_module('bl' + str(hg_module), nn.Conv2d(256, 256, kernel_size=1, stride=1, padding=0)) + self.add_module('al' + str(hg_module), + nn.Conv2d(num_landmarks + 1, 256, kernel_size=1, stride=1, padding=0)) + + def forward(self, x): + x, _ = self.conv1(x) + x = F.relu(self.bn1(x), True) + # x = 
F.relu(self.bn1(self.conv1(x)), True) + x = F.avg_pool2d(self.conv2(x), 2, stride=2) + x = self.conv3(x) + x = self.conv4(x) + + previous = x + + outputs = [] + boundary_channels = [] + tmp_out = None + for i in range(self.num_modules): + hg, boundary_channel = self._modules['m' + str(i)](previous, tmp_out) + + ll = hg + ll = self._modules['top_m_' + str(i)](ll) + + ll = F.relu(self._modules['bn_end' + str(i)](self._modules['conv_last' + str(i)](ll)), True) + + # Predict heatmaps + tmp_out = self._modules['l' + str(i)](ll) + if self.end_relu: + tmp_out = F.relu(tmp_out) # HACK: Added relu + outputs.append(tmp_out) + boundary_channels.append(boundary_channel) + + if i < self.num_modules - 1: + ll = self._modules['bl' + str(i)](ll) + tmp_out_ = self._modules['al' + str(i)](tmp_out) + previous = previous + ll + tmp_out_ + + return outputs, boundary_channels + + def get_landmarks(self, img): + H, W, _ = img.shape + offset = W / 64, H / 64, 0, 0 + + img = cv2.resize(img, (256, 256)) + inp = img[..., ::-1] + inp = torch.from_numpy(np.ascontiguousarray(inp.transpose((2, 0, 1)))).float() + inp = inp.to(self.device) + inp.div_(255.0).unsqueeze_(0) + + outputs, _ = self.forward(inp) + out = outputs[-1][:, :-1, :, :] + heatmaps = out.detach().cpu().numpy() + + pred = calculate_points(heatmaps).reshape(-1, 2) + + pred *= offset[:2] + pred += offset[-2:] + + return pred diff --git a/imaginairy/vendored/facexlib/alignment/convert_98_to_68_landmarks.py b/imaginairy/vendored/facexlib/alignment/convert_98_to_68_landmarks.py new file mode 100644 index 0000000..376f661 --- /dev/null +++ b/imaginairy/vendored/facexlib/alignment/convert_98_to_68_landmarks.py @@ -0,0 +1,82 @@ +import numpy as np + + +def load_txt_file(file_path): + """Load data or string from txt file.""" + + with open(file_path, 'r') as cfile: + content = cfile.readlines() + cfile.close() + content = [x.strip() for x in content] + num_lines = len(content) + return content, num_lines + + +def anno_parser(anno_path, num_pts, line_offset=0): + """Parse the annotation. + Args: + anno_path: path of anno file (suffix .txt) + num_pts: number of landmarks. + line_offset: first point starts, default: 0. + + Returns: + pts: num_pts x 2 (x, y) + """ + + data, _ = load_txt_file(anno_path) + n_points = num_pts + # read points coordinate. + pts = np.zeros((n_points, 2), dtype='float32') + for point_index in range(n_points): + try: + pts_list = data[point_index + line_offset].split(',') + pts[point_index, 0] = float(pts_list[0]) + pts[point_index, 1] = float(pts_list[1]) + except ValueError: + print(f'Error in loading points in {anno_path}') + return pts + + +def landmark_98_to_68(landmark_98): + """Transfer 98 landmark positions to 68 landmark positions. 
+ Args: + landmark_98(numpy array): Polar coordinates of 98 landmarks, (98, 2) + Returns: + landmark_68(numpy array): Polar coordinates of 98 landmarks, (68, 2) + """ + + landmark_68 = np.zeros((68, 2), dtype='float32') + # cheek + for i in range(0, 33): + if i % 2 == 0: + landmark_68[int(i / 2), :] = landmark_98[i, :] + # nose + for i in range(51, 60): + landmark_68[i - 24, :] = landmark_98[i, :] + # mouth + for i in range(76, 96): + landmark_68[i - 28, :] = landmark_98[i, :] + # left eyebrow + landmark_68[17, :] = landmark_98[33, :] + landmark_68[18, :] = (landmark_98[34, :] + landmark_98[41, :]) / 2 + landmark_68[19, :] = (landmark_98[35, :] + landmark_98[40, :]) / 2 + landmark_68[20, :] = (landmark_98[36, :] + landmark_98[39, :]) / 2 + landmark_68[21, :] = (landmark_98[37, :] + landmark_98[38, :]) / 2 + # right eyebrow + landmark_68[22, :] = (landmark_98[42, :] + landmark_98[50, :]) / 2 + landmark_68[23, :] = (landmark_98[43, :] + landmark_98[49, :]) / 2 + landmark_68[24, :] = (landmark_98[44, :] + landmark_98[48, :]) / 2 + landmark_68[25, :] = (landmark_98[45, :] + landmark_98[47, :]) / 2 + landmark_68[26, :] = landmark_98[46, :] + # left eye + LUT_landmark_68_left_eye = [36, 37, 38, 39, 40, 41] + LUT_landmark_98_left_eye = [60, 61, 63, 64, 65, 67] + for idx, landmark_98_index in enumerate(LUT_landmark_98_left_eye): + landmark_68[LUT_landmark_68_left_eye[idx], :] = landmark_98[landmark_98_index, :] + # right eye + LUT_landmark_68_right_eye = [42, 43, 44, 45, 46, 47] + LUT_landmark_98_right_eye = [68, 69, 71, 72, 73, 75] + for idx, landmark_98_index in enumerate(LUT_landmark_98_right_eye): + landmark_68[LUT_landmark_68_right_eye[idx], :] = landmark_98[landmark_98_index, :] + + return landmark_68 diff --git a/imaginairy/vendored/facexlib/assessment/__init__.py b/imaginairy/vendored/facexlib/assessment/__init__.py new file mode 100644 index 0000000..ba426fe --- /dev/null +++ b/imaginairy/vendored/facexlib/assessment/__init__.py @@ -0,0 +1,20 @@ +import torch + +from imaginairy.vendored.facexlib.utils import load_file_from_url +from .hyperiqa_net import HyperIQA + + +def init_assessment_model(model_name, half=False, device='cuda', model_rootpath=None): + if model_name == 'hypernet': + model = HyperIQA(16, 112, 224, 112, 56, 28, 14, 7) + model_url = 'https://github.com/xinntao/facexlib/releases/download/v0.2.0/assessment_hyperIQA.pth' + else: + raise NotImplementedError(f'{model_name} is not implemented.') + + # load the pre-trained hypernet model + hypernet_model_path = load_file_from_url( + url=model_url, model_dir='facexlib/weights', progress=True, file_name=None, save_dir=model_rootpath) + model.hypernet.load_state_dict((torch.load(hypernet_model_path, map_location=lambda storage, loc: storage))) + model = model.eval() + model = model.to(device) + return model diff --git a/imaginairy/vendored/facexlib/assessment/hyperiqa_net.py b/imaginairy/vendored/facexlib/assessment/hyperiqa_net.py new file mode 100644 index 0000000..216fbac --- /dev/null +++ b/imaginairy/vendored/facexlib/assessment/hyperiqa_net.py @@ -0,0 +1,298 @@ +import torch as torch +import torch.nn as nn +from torch.nn import functional as F + + +class HyperIQA(nn.Module): + """ + Combine the hypernet and target network within a network. 
+ """ + + def __init__(self, *args): + super(HyperIQA, self).__init__() + self.hypernet = HyperNet(*args) + + def forward(self, img): + net_params = self.hypernet(img) + # build the target network + target_net = TargetNet(net_params) + for param in target_net.parameters(): + param.requires_grad = False + # predict the face quality + pred = target_net(net_params['target_in_vec']) + return pred + + +class HyperNet(nn.Module): + """ + Hyper network for learning perceptual rules. + Args: + lda_out_channels: local distortion aware module output size. + hyper_in_channels: input feature channels for hyper network. + target_in_size: input vector size for target network. + target_fc(i)_size: fully connection layer size of target network. + feature_size: input feature map width/height for hyper network. + Note: + For size match, input args must satisfy: 'target_fc(i)_size * target_fc(i+1)_size' is divisible by 'feature_size ^ 2'. # noqa E501 + """ + + def __init__(self, lda_out_channels, hyper_in_channels, target_in_size, target_fc1_size, target_fc2_size, + target_fc3_size, target_fc4_size, feature_size): + super(HyperNet, self).__init__() + + self.hyperInChn = hyper_in_channels + self.target_in_size = target_in_size + self.f1 = target_fc1_size + self.f2 = target_fc2_size + self.f3 = target_fc3_size + self.f4 = target_fc4_size + self.feature_size = feature_size + + self.res = resnet50_backbone(lda_out_channels, target_in_size) + + self.pool = nn.AdaptiveAvgPool2d((1, 1)) + + # Conv layers for resnet output features + self.conv1 = nn.Sequential( + nn.Conv2d(2048, 1024, 1, padding=(0, 0)), nn.ReLU(inplace=True), nn.Conv2d(1024, 512, 1, padding=(0, 0)), + nn.ReLU(inplace=True), nn.Conv2d(512, self.hyperInChn, 1, padding=(0, 0)), nn.ReLU(inplace=True)) + + # Hyper network part, conv for generating target fc weights, fc for generating target fc biases + self.fc1w_conv = nn.Conv2d( + self.hyperInChn, int(self.target_in_size * self.f1 / feature_size**2), 3, padding=(1, 1)) + self.fc1b_fc = nn.Linear(self.hyperInChn, self.f1) + + self.fc2w_conv = nn.Conv2d(self.hyperInChn, int(self.f1 * self.f2 / feature_size**2), 3, padding=(1, 1)) + self.fc2b_fc = nn.Linear(self.hyperInChn, self.f2) + + self.fc3w_conv = nn.Conv2d(self.hyperInChn, int(self.f2 * self.f3 / feature_size**2), 3, padding=(1, 1)) + self.fc3b_fc = nn.Linear(self.hyperInChn, self.f3) + + self.fc4w_conv = nn.Conv2d(self.hyperInChn, int(self.f3 * self.f4 / feature_size**2), 3, padding=(1, 1)) + self.fc4b_fc = nn.Linear(self.hyperInChn, self.f4) + + self.fc5w_fc = nn.Linear(self.hyperInChn, self.f4) + self.fc5b_fc = nn.Linear(self.hyperInChn, 1) + + def forward(self, img): + feature_size = self.feature_size + + res_out = self.res(img) + + # input vector for target net + target_in_vec = res_out['target_in_vec'].view(-1, self.target_in_size, 1, 1) + + # input features for hyper net + hyper_in_feat = self.conv1(res_out['hyper_in_feat']).view(-1, self.hyperInChn, feature_size, feature_size) + + # generating target net weights & biases + target_fc1w = self.fc1w_conv(hyper_in_feat).view(-1, self.f1, self.target_in_size, 1, 1) + target_fc1b = self.fc1b_fc(self.pool(hyper_in_feat).squeeze()).view(-1, self.f1) + + target_fc2w = self.fc2w_conv(hyper_in_feat).view(-1, self.f2, self.f1, 1, 1) + target_fc2b = self.fc2b_fc(self.pool(hyper_in_feat).squeeze()).view(-1, self.f2) + + target_fc3w = self.fc3w_conv(hyper_in_feat).view(-1, self.f3, self.f2, 1, 1) + target_fc3b = self.fc3b_fc(self.pool(hyper_in_feat).squeeze()).view(-1, self.f3) + + target_fc4w = 
self.fc4w_conv(hyper_in_feat).view(-1, self.f4, self.f3, 1, 1) + target_fc4b = self.fc4b_fc(self.pool(hyper_in_feat).squeeze()).view(-1, self.f4) + + target_fc5w = self.fc5w_fc(self.pool(hyper_in_feat).squeeze()).view(-1, 1, self.f4, 1, 1) + target_fc5b = self.fc5b_fc(self.pool(hyper_in_feat).squeeze()).view(-1, 1) + + out = {} + out['target_in_vec'] = target_in_vec + out['target_fc1w'] = target_fc1w + out['target_fc1b'] = target_fc1b + out['target_fc2w'] = target_fc2w + out['target_fc2b'] = target_fc2b + out['target_fc3w'] = target_fc3w + out['target_fc3b'] = target_fc3b + out['target_fc4w'] = target_fc4w + out['target_fc4b'] = target_fc4b + out['target_fc5w'] = target_fc5w + out['target_fc5b'] = target_fc5b + + return out + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(Bottleneck, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) + self.bn3 = nn.BatchNorm2d(planes * 4) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class ResNetBackbone(nn.Module): + + def __init__(self, lda_out_channels, in_chn, block, layers, num_classes=1000): + super(ResNetBackbone, self).__init__() + self.inplanes = 64 + self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False) + self.bn1 = nn.BatchNorm2d(64) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2) + self.layer4 = self._make_layer(block, 512, layers[3], stride=2) + + # local distortion aware module + self.lda1_pool = nn.Sequential( + nn.Conv2d(256, 16, kernel_size=1, stride=1, padding=0, bias=False), + nn.AvgPool2d(7, stride=7), + ) + self.lda1_fc = nn.Linear(16 * 64, lda_out_channels) + + self.lda2_pool = nn.Sequential( + nn.Conv2d(512, 32, kernel_size=1, stride=1, padding=0, bias=False), + nn.AvgPool2d(7, stride=7), + ) + self.lda2_fc = nn.Linear(32 * 16, lda_out_channels) + + self.lda3_pool = nn.Sequential( + nn.Conv2d(1024, 64, kernel_size=1, stride=1, padding=0, bias=False), + nn.AvgPool2d(7, stride=7), + ) + self.lda3_fc = nn.Linear(64 * 4, lda_out_channels) + + self.lda4_pool = nn.AvgPool2d(7, stride=7) + self.lda4_fc = nn.Linear(2048, in_chn - lda_out_channels * 3) + + def _make_layer(self, block, planes, blocks, stride=1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d(self.inplanes, planes * block.expansion, kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(planes * block.expansion), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + 
layers.append(block(self.inplanes, planes)) + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + x = self.layer1(x) + + # the same effect as lda operation in the paper, but save much more memory + lda_1 = self.lda1_fc(self.lda1_pool(x).view(x.size(0), -1)) + x = self.layer2(x) + lda_2 = self.lda2_fc(self.lda2_pool(x).view(x.size(0), -1)) + x = self.layer3(x) + lda_3 = self.lda3_fc(self.lda3_pool(x).view(x.size(0), -1)) + x = self.layer4(x) + lda_4 = self.lda4_fc(self.lda4_pool(x).view(x.size(0), -1)) + + vec = torch.cat((lda_1, lda_2, lda_3, lda_4), 1) + + out = {} + out['hyper_in_feat'] = x + out['target_in_vec'] = vec + + return out + + +def resnet50_backbone(lda_out_channels, in_chn, **kwargs): + """Constructs a ResNet-50 model_hyper.""" + model = ResNetBackbone(lda_out_channels, in_chn, Bottleneck, [3, 4, 6, 3], **kwargs) + return model + + +class TargetNet(nn.Module): + """ + Target network for quality prediction. + """ + + def __init__(self, paras): + super(TargetNet, self).__init__() + self.l1 = nn.Sequential( + TargetFC(paras['target_fc1w'], paras['target_fc1b']), + nn.Sigmoid(), + ) + self.l2 = nn.Sequential( + TargetFC(paras['target_fc2w'], paras['target_fc2b']), + nn.Sigmoid(), + ) + + self.l3 = nn.Sequential( + TargetFC(paras['target_fc3w'], paras['target_fc3b']), + nn.Sigmoid(), + ) + + self.l4 = nn.Sequential( + TargetFC(paras['target_fc4w'], paras['target_fc4b']), + nn.Sigmoid(), + TargetFC(paras['target_fc5w'], paras['target_fc5b']), + ) + + def forward(self, x): + q = self.l1(x) + # q = F.dropout(q) + q = self.l2(q) + q = self.l3(q) + q = self.l4(q).squeeze() + return q + + +class TargetFC(nn.Module): + """ + Fully connection operations for target net + Note: + Weights & biases are different for different images in a batch, + thus here we use group convolution for calculating images in a batch with individual weights & biases. 
+ """ + + def __init__(self, weight, bias): + super(TargetFC, self).__init__() + self.weight = weight + self.bias = bias + + def forward(self, input_): + + input_re = input_.view(-1, input_.shape[0] * input_.shape[1], input_.shape[2], input_.shape[3]) + weight_re = self.weight.view(self.weight.shape[0] * self.weight.shape[1], self.weight.shape[2], + self.weight.shape[3], self.weight.shape[4]) + bias_re = self.bias.view(self.bias.shape[0] * self.bias.shape[1]) + out = F.conv2d(input=input_re, weight=weight_re, bias=bias_re, groups=self.weight.shape[0]) + + return out.view(input_.shape[0], self.weight.shape[1], input_.shape[2], input_.shape[3]) diff --git a/imaginairy/vendored/facexlib/detection/__init__.py b/imaginairy/vendored/facexlib/detection/__init__.py new file mode 100644 index 0000000..b9966e1 --- /dev/null +++ b/imaginairy/vendored/facexlib/detection/__init__.py @@ -0,0 +1,31 @@ +import torch +from copy import deepcopy + +from imaginairy.vendored.facexlib.utils import load_file_from_url +from .retinaface import RetinaFace + + +def init_detection_model(model_name, half=False, device='cuda', model_rootpath=None): + if model_name == 'retinaface_resnet50': + model = RetinaFace(network_name='resnet50', half=half, device=device) + model_url = 'https://github.com/xinntao/facexlib/releases/download/v0.1.0/detection_Resnet50_Final.pth' + elif model_name == 'retinaface_mobile0.25': + model = RetinaFace(network_name='mobile0.25', half=half, device=device) + model_url = 'https://github.com/xinntao/facexlib/releases/download/v0.1.0/detection_mobilenet0.25_Final.pth' + else: + raise NotImplementedError(f'{model_name} is not implemented.') + + model_path = load_file_from_url( + url=model_url, model_dir='facexlib/weights', progress=True, file_name=None, save_dir=model_rootpath) + + # TODO: clean pretrained model + load_net = torch.load(model_path, map_location=lambda storage, loc: storage) + # remove unnecessary 'module.' + for k, v in deepcopy(load_net).items(): + if k.startswith('module.'): + load_net[k[7:]] = v + load_net.pop(k) + model.load_state_dict(load_net, strict=True) + model.eval() + model = model.to(device) + return model diff --git a/imaginairy/vendored/facexlib/detection/align_trans.py b/imaginairy/vendored/facexlib/detection/align_trans.py new file mode 100644 index 0000000..07f1eb3 --- /dev/null +++ b/imaginairy/vendored/facexlib/detection/align_trans.py @@ -0,0 +1,219 @@ +import cv2 +import numpy as np + +from .matlab_cp2tform import get_similarity_transform_for_cv2 + +# reference facial points, a list of coordinates (x,y) +REFERENCE_FACIAL_POINTS = [[30.29459953, 51.69630051], [65.53179932, 51.50139999], [48.02519989, 71.73660278], + [33.54930115, 92.3655014], [62.72990036, 92.20410156]] + +DEFAULT_CROP_SIZE = (96, 112) + + +class FaceWarpException(Exception): + + def __str__(self): + return 'In File {}:{}'.format(__file__, super.__str__(self)) + + +def get_reference_facial_points(output_size=None, inner_padding_factor=0.0, outer_padding=(0, 0), default_square=False): + """ + Function: + ---------- + get reference 5 key points according to crop settings: + 0. Set default crop_size: + if default_square: + crop_size = (112, 112) + else: + crop_size = (96, 112) + 1. Pad the crop_size by inner_padding_factor in each side; + 2. Resize crop_size into (output_size - outer_padding*2), + pad into output_size with outer_padding; + 3. 
Output reference_5point; + Parameters: + ---------- + @output_size: (w, h) or None + size of aligned face image + @inner_padding_factor: (w_factor, h_factor) + padding factor for inner (w, h) + @outer_padding: (w_pad, h_pad) + each row is a pair of coordinates (x, y) + @default_square: True or False + if True: + default crop_size = (112, 112) + else: + default crop_size = (96, 112); + !!! make sure, if output_size is not None: + (output_size - outer_padding) + = some_scale * (default crop_size * (1.0 + + inner_padding_factor)) + Returns: + ---------- + @reference_5point: 5x2 np.array + each row is a pair of transformed coordinates (x, y) + """ + + tmp_5pts = np.array(REFERENCE_FACIAL_POINTS) + tmp_crop_size = np.array(DEFAULT_CROP_SIZE) + + # 0) make the inner region a square + if default_square: + size_diff = max(tmp_crop_size) - tmp_crop_size + tmp_5pts += size_diff / 2 + tmp_crop_size += size_diff + + if (output_size and output_size[0] == tmp_crop_size[0] and output_size[1] == tmp_crop_size[1]): + + return tmp_5pts + + if (inner_padding_factor == 0 and outer_padding == (0, 0)): + if output_size is None: + return tmp_5pts + else: + raise FaceWarpException('No paddings to do, output_size must be None or {}'.format(tmp_crop_size)) + + # check output size + if not (0 <= inner_padding_factor <= 1.0): + raise FaceWarpException('Not (0 <= inner_padding_factor <= 1.0)') + + if ((inner_padding_factor > 0 or outer_padding[0] > 0 or outer_padding[1] > 0) and output_size is None): + output_size = tmp_crop_size * \ + (1 + inner_padding_factor * 2).astype(np.int32) + output_size += np.array(outer_padding) + if not (outer_padding[0] < output_size[0] and outer_padding[1] < output_size[1]): + raise FaceWarpException('Not (outer_padding[0] < output_size[0] and outer_padding[1] < output_size[1])') + + # 1) pad the inner region according inner_padding_factor + if inner_padding_factor > 0: + size_diff = tmp_crop_size * inner_padding_factor * 2 + tmp_5pts += size_diff / 2 + tmp_crop_size += np.round(size_diff).astype(np.int32) + + # 2) resize the padded inner region + size_bf_outer_pad = np.array(output_size) - np.array(outer_padding) * 2 + + if size_bf_outer_pad[0] * tmp_crop_size[1] != size_bf_outer_pad[1] * tmp_crop_size[0]: + raise FaceWarpException('Must have (output_size - outer_padding)' + '= some_scale * (crop_size * (1.0 + inner_padding_factor)') + + scale_factor = size_bf_outer_pad[0].astype(np.float32) / tmp_crop_size[0] + tmp_5pts = tmp_5pts * scale_factor + # size_diff = tmp_crop_size * (scale_factor - min(scale_factor)) + # tmp_5pts = tmp_5pts + size_diff / 2 + tmp_crop_size = size_bf_outer_pad + + # 3) add outer_padding to make output_size + reference_5point = tmp_5pts + np.array(outer_padding) + tmp_crop_size = output_size + + return reference_5point + + +def get_affine_transform_matrix(src_pts, dst_pts): + """ + Function: + ---------- + get affine transform matrix 'tfm' from src_pts to dst_pts + Parameters: + ---------- + @src_pts: Kx2 np.array + source points matrix, each row is a pair of coordinates (x, y) + @dst_pts: Kx2 np.array + destination points matrix, each row is a pair of coordinates (x, y) + Returns: + ---------- + @tfm: 2x3 np.array + transform matrix from src_pts to dst_pts + """ + + tfm = np.float32([[1, 0, 0], [0, 1, 0]]) + n_pts = src_pts.shape[0] + ones = np.ones((n_pts, 1), src_pts.dtype) + src_pts_ = np.hstack([src_pts, ones]) + dst_pts_ = np.hstack([dst_pts, ones]) + + A, res, rank, s = np.linalg.lstsq(src_pts_, dst_pts_) + + if rank == 3: + tfm = np.float32([[A[0, 0], 
A[1, 0], A[2, 0]], [A[0, 1], A[1, 1], A[2, 1]]]) + elif rank == 2: + tfm = np.float32([[A[0, 0], A[1, 0], 0], [A[0, 1], A[1, 1], 0]]) + + return tfm + + +def warp_and_crop_face(src_img, facial_pts, reference_pts=None, crop_size=(96, 112), align_type='smilarity'): + """ + Function: + ---------- + apply affine transform 'trans' to uv + Parameters: + ---------- + @src_img: 3x3 np.array + input image + @facial_pts: could be + 1)a list of K coordinates (x,y) + or + 2) Kx2 or 2xK np.array + each row or col is a pair of coordinates (x, y) + @reference_pts: could be + 1) a list of K coordinates (x,y) + or + 2) Kx2 or 2xK np.array + each row or col is a pair of coordinates (x, y) + or + 3) None + if None, use default reference facial points + @crop_size: (w, h) + output face image size + @align_type: transform type, could be one of + 1) 'similarity': use similarity transform + 2) 'cv2_affine': use the first 3 points to do affine transform, + by calling cv2.getAffineTransform() + 3) 'affine': use all points to do affine transform + Returns: + ---------- + @face_img: output face image with size (w, h) = @crop_size + """ + + if reference_pts is None: + if crop_size[0] == 96 and crop_size[1] == 112: + reference_pts = REFERENCE_FACIAL_POINTS + else: + default_square = False + inner_padding_factor = 0 + outer_padding = (0, 0) + output_size = crop_size + + reference_pts = get_reference_facial_points(output_size, inner_padding_factor, outer_padding, + default_square) + + ref_pts = np.float32(reference_pts) + ref_pts_shp = ref_pts.shape + if max(ref_pts_shp) < 3 or min(ref_pts_shp) != 2: + raise FaceWarpException('reference_pts.shape must be (K,2) or (2,K) and K>2') + + if ref_pts_shp[0] == 2: + ref_pts = ref_pts.T + + src_pts = np.float32(facial_pts) + src_pts_shp = src_pts.shape + if max(src_pts_shp) < 3 or min(src_pts_shp) != 2: + raise FaceWarpException('facial_pts.shape must be (K,2) or (2,K) and K>2') + + if src_pts_shp[0] == 2: + src_pts = src_pts.T + + if src_pts.shape != ref_pts.shape: + raise FaceWarpException('facial_pts and reference_pts must have the same shape') + + if align_type == 'cv2_affine': + tfm = cv2.getAffineTransform(src_pts[0:3], ref_pts[0:3]) + elif align_type == 'affine': + tfm = get_affine_transform_matrix(src_pts, ref_pts) + else: + tfm = get_similarity_transform_for_cv2(src_pts, ref_pts) + + face_img = cv2.warpAffine(src_img, tfm, (crop_size[0], crop_size[1])) + + return face_img diff --git a/imaginairy/vendored/facexlib/detection/matlab_cp2tform.py b/imaginairy/vendored/facexlib/detection/matlab_cp2tform.py new file mode 100644 index 0000000..b2a8b54 --- /dev/null +++ b/imaginairy/vendored/facexlib/detection/matlab_cp2tform.py @@ -0,0 +1,317 @@ +import numpy as np +from numpy.linalg import inv, lstsq +from numpy.linalg import matrix_rank as rank +from numpy.linalg import norm + + +class MatlabCp2tormException(Exception): + + def __str__(self): + return 'In File {}:{}'.format(__file__, super.__str__(self)) + + +def tformfwd(trans, uv): + """ + Function: + ---------- + apply affine transform 'trans' to uv + + Parameters: + ---------- + @trans: 3x3 np.array + transform matrix + @uv: Kx2 np.array + each row is a pair of coordinates (x, y) + + Returns: + ---------- + @xy: Kx2 np.array + each row is a pair of transformed coordinates (x, y) + """ + uv = np.hstack((uv, np.ones((uv.shape[0], 1)))) + xy = np.dot(uv, trans) + xy = xy[:, 0:-1] + return xy + + +def tforminv(trans, uv): + """ + Function: + ---------- + apply the inverse of affine transform 'trans' to uv + + Parameters: + 
---------- + @trans: 3x3 np.array + transform matrix + @uv: Kx2 np.array + each row is a pair of coordinates (x, y) + + Returns: + ---------- + @xy: Kx2 np.array + each row is a pair of inverse-transformed coordinates (x, y) + """ + Tinv = inv(trans) + xy = tformfwd(Tinv, uv) + return xy + + +def findNonreflectiveSimilarity(uv, xy, options=None): + options = {'K': 2} + + K = options['K'] + M = xy.shape[0] + x = xy[:, 0].reshape((-1, 1)) # use reshape to keep a column vector + y = xy[:, 1].reshape((-1, 1)) # use reshape to keep a column vector + + tmp1 = np.hstack((x, y, np.ones((M, 1)), np.zeros((M, 1)))) + tmp2 = np.hstack((y, -x, np.zeros((M, 1)), np.ones((M, 1)))) + X = np.vstack((tmp1, tmp2)) + + u = uv[:, 0].reshape((-1, 1)) # use reshape to keep a column vector + v = uv[:, 1].reshape((-1, 1)) # use reshape to keep a column vector + U = np.vstack((u, v)) + + # We know that X * r = U + if rank(X) >= 2 * K: + r, _, _, _ = lstsq(X, U, rcond=-1) + r = np.squeeze(r) + else: + raise Exception('cp2tform:twoUniquePointsReq') + sc = r[0] + ss = r[1] + tx = r[2] + ty = r[3] + + Tinv = np.array([[sc, -ss, 0], [ss, sc, 0], [tx, ty, 1]]) + T = inv(Tinv) + T[:, 2] = np.array([0, 0, 1]) + + return T, Tinv + + +def findSimilarity(uv, xy, options=None): + options = {'K': 2} + + # uv = np.array(uv) + # xy = np.array(xy) + + # Solve for trans1 + trans1, trans1_inv = findNonreflectiveSimilarity(uv, xy, options) + + # Solve for trans2 + + # manually reflect the xy data across the Y-axis + xyR = xy + xyR[:, 0] = -1 * xyR[:, 0] + + trans2r, trans2r_inv = findNonreflectiveSimilarity(uv, xyR, options) + + # manually reflect the tform to undo the reflection done on xyR + TreflectY = np.array([[-1, 0, 0], [0, 1, 0], [0, 0, 1]]) + + trans2 = np.dot(trans2r, TreflectY) + + # Figure out if trans1 or trans2 is better + xy1 = tformfwd(trans1, uv) + norm1 = norm(xy1 - xy) + + xy2 = tformfwd(trans2, uv) + norm2 = norm(xy2 - xy) + + if norm1 <= norm2: + return trans1, trans1_inv + else: + trans2_inv = inv(trans2) + return trans2, trans2_inv + + +def get_similarity_transform(src_pts, dst_pts, reflective=True): + """ + Function: + ---------- + Find Similarity Transform Matrix 'trans': + u = src_pts[:, 0] + v = src_pts[:, 1] + x = dst_pts[:, 0] + y = dst_pts[:, 1] + [x, y, 1] = [u, v, 1] * trans + + Parameters: + ---------- + @src_pts: Kx2 np.array + source points, each row is a pair of coordinates (x, y) + @dst_pts: Kx2 np.array + destination points, each row is a pair of transformed + coordinates (x, y) + @reflective: True or False + if True: + use reflective similarity transform + else: + use non-reflective similarity transform + + Returns: + ---------- + @trans: 3x3 np.array + transform matrix from uv to xy + trans_inv: 3x3 np.array + inverse of trans, transform matrix from xy to uv + """ + + if reflective: + trans, trans_inv = findSimilarity(src_pts, dst_pts) + else: + trans, trans_inv = findNonreflectiveSimilarity(src_pts, dst_pts) + + return trans, trans_inv + + +def cvt_tform_mat_for_cv2(trans): + """ + Function: + ---------- + Convert Transform Matrix 'trans' into 'cv2_trans' which could be + directly used by cv2.warpAffine(): + u = src_pts[:, 0] + v = src_pts[:, 1] + x = dst_pts[:, 0] + y = dst_pts[:, 1] + [x, y].T = cv_trans * [u, v, 1].T + + Parameters: + ---------- + @trans: 3x3 np.array + transform matrix from uv to xy + + Returns: + ---------- + @cv2_trans: 2x3 np.array + transform matrix from src_pts to dst_pts, could be directly used + for cv2.warpAffine() + """ + cv2_trans = trans[:, 0:2].T + + return 
cv2_trans + + +def get_similarity_transform_for_cv2(src_pts, dst_pts, reflective=True): + """ + Function: + ---------- + Find Similarity Transform Matrix 'cv2_trans' which could be + directly used by cv2.warpAffine(): + u = src_pts[:, 0] + v = src_pts[:, 1] + x = dst_pts[:, 0] + y = dst_pts[:, 1] + [x, y].T = cv_trans * [u, v, 1].T + + Parameters: + ---------- + @src_pts: Kx2 np.array + source points, each row is a pair of coordinates (x, y) + @dst_pts: Kx2 np.array + destination points, each row is a pair of transformed + coordinates (x, y) + reflective: True or False + if True: + use reflective similarity transform + else: + use non-reflective similarity transform + + Returns: + ---------- + @cv2_trans: 2x3 np.array + transform matrix from src_pts to dst_pts, could be directly used + for cv2.warpAffine() + """ + trans, trans_inv = get_similarity_transform(src_pts, dst_pts, reflective) + cv2_trans = cvt_tform_mat_for_cv2(trans) + + return cv2_trans + + +if __name__ == '__main__': + """ + u = [0, 6, -2] + v = [0, 3, 5] + x = [-1, 0, 4] + y = [-1, -10, 4] + + # In Matlab, run: + # + # uv = [u'; v']; + # xy = [x'; y']; + # tform_sim=cp2tform(uv,xy,'similarity'); + # + # trans = tform_sim.tdata.T + # ans = + # -0.0764 -1.6190 0 + # 1.6190 -0.0764 0 + # -3.2156 0.0290 1.0000 + # trans_inv = tform_sim.tdata.Tinv + # ans = + # + # -0.0291 0.6163 0 + # -0.6163 -0.0291 0 + # -0.0756 1.9826 1.0000 + # xy_m=tformfwd(tform_sim, u,v) + # + # xy_m = + # + # -3.2156 0.0290 + # 1.1833 -9.9143 + # 5.0323 2.8853 + # uv_m=tforminv(tform_sim, x,y) + # + # uv_m = + # + # 0.5698 1.3953 + # 6.0872 2.2733 + # -2.6570 4.3314 + """ + u = [0, 6, -2] + v = [0, 3, 5] + x = [-1, 0, 4] + y = [-1, -10, 4] + + uv = np.array((u, v)).T + xy = np.array((x, y)).T + + print('\n--->uv:') + print(uv) + print('\n--->xy:') + print(xy) + + trans, trans_inv = get_similarity_transform(uv, xy) + + print('\n--->trans matrix:') + print(trans) + + print('\n--->trans_inv matrix:') + print(trans_inv) + + print('\n---> apply transform to uv') + print('\nxy_m = uv_augmented * trans') + uv_aug = np.hstack((uv, np.ones((uv.shape[0], 1)))) + xy_m = np.dot(uv_aug, trans) + print(xy_m) + + print('\nxy_m = tformfwd(trans, uv)') + xy_m = tformfwd(trans, uv) + print(xy_m) + + print('\n---> apply inverse transform to xy') + print('\nuv_m = xy_augmented * trans_inv') + xy_aug = np.hstack((xy, np.ones((xy.shape[0], 1)))) + uv_m = np.dot(xy_aug, trans_inv) + print(uv_m) + + print('\nuv_m = tformfwd(trans_inv, xy)') + uv_m = tformfwd(trans_inv, xy) + print(uv_m) + + uv_m = tforminv(trans, xy) + print('\nuv_m = tforminv(trans, xy)') + print(uv_m) diff --git a/imaginairy/vendored/facexlib/detection/retinaface.py b/imaginairy/vendored/facexlib/detection/retinaface.py new file mode 100644 index 0000000..c49a445 --- /dev/null +++ b/imaginairy/vendored/facexlib/detection/retinaface.py @@ -0,0 +1,366 @@ +import cv2 +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from PIL import Image +from torchvision.models._utils import IntermediateLayerGetter as IntermediateLayerGetter + +from imaginairy.vendored.facexlib.detection.align_trans import get_reference_facial_points, warp_and_crop_face +from imaginairy.vendored.facexlib.detection.retinaface_net import FPN, SSH, MobileNetV1, make_bbox_head, make_class_head, make_landmark_head +from imaginairy.vendored.facexlib.detection.retinaface_utils import (PriorBox, batched_decode, batched_decode_landm, decode, decode_landm, + py_cpu_nms) + + +def generate_config(network_name): + + 
cfg_mnet = { + 'name': 'mobilenet0.25', + 'min_sizes': [[16, 32], [64, 128], [256, 512]], + 'steps': [8, 16, 32], + 'variance': [0.1, 0.2], + 'clip': False, + 'loc_weight': 2.0, + 'gpu_train': True, + 'batch_size': 32, + 'ngpu': 1, + 'epoch': 250, + 'decay1': 190, + 'decay2': 220, + 'image_size': 640, + 'return_layers': { + 'stage1': 1, + 'stage2': 2, + 'stage3': 3 + }, + 'in_channel': 32, + 'out_channel': 64 + } + + cfg_re50 = { + 'name': 'Resnet50', + 'min_sizes': [[16, 32], [64, 128], [256, 512]], + 'steps': [8, 16, 32], + 'variance': [0.1, 0.2], + 'clip': False, + 'loc_weight': 2.0, + 'gpu_train': True, + 'batch_size': 24, + 'ngpu': 4, + 'epoch': 100, + 'decay1': 70, + 'decay2': 90, + 'image_size': 840, + 'return_layers': { + 'layer2': 1, + 'layer3': 2, + 'layer4': 3 + }, + 'in_channel': 256, + 'out_channel': 256 + } + + if network_name == 'mobile0.25': + return cfg_mnet + elif network_name == 'resnet50': + return cfg_re50 + else: + raise NotImplementedError(f'network_name={network_name}') + + +class RetinaFace(nn.Module): + + def __init__(self, network_name='resnet50', half=False, phase='test', device=None): + self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') if device is None else device + + super(RetinaFace, self).__init__() + self.half_inference = half + cfg = generate_config(network_name) + self.backbone = cfg['name'] + + self.model_name = f'retinaface_{network_name}' + self.cfg = cfg + self.phase = phase + self.target_size, self.max_size = 1600, 2150 + self.resize, self.scale, self.scale1 = 1., None, None + self.mean_tensor = torch.tensor([[[[104.]], [[117.]], [[123.]]]], device=self.device) + self.reference = get_reference_facial_points(default_square=True) + # Build network. + backbone = None + if cfg['name'] == 'mobilenet0.25': + backbone = MobileNetV1() + self.body = IntermediateLayerGetter(backbone, cfg['return_layers']) + elif cfg['name'] == 'Resnet50': + import torchvision.models as models + backbone = models.resnet50(pretrained=False) + self.body = IntermediateLayerGetter(backbone, cfg['return_layers']) + + in_channels_stage2 = cfg['in_channel'] + in_channels_list = [ + in_channels_stage2 * 2, + in_channels_stage2 * 4, + in_channels_stage2 * 8, + ] + + out_channels = cfg['out_channel'] + self.fpn = FPN(in_channels_list, out_channels) + self.ssh1 = SSH(out_channels, out_channels) + self.ssh2 = SSH(out_channels, out_channels) + self.ssh3 = SSH(out_channels, out_channels) + + self.ClassHead = make_class_head(fpn_num=3, inchannels=cfg['out_channel']) + self.BboxHead = make_bbox_head(fpn_num=3, inchannels=cfg['out_channel']) + self.LandmarkHead = make_landmark_head(fpn_num=3, inchannels=cfg['out_channel']) + + self.to(self.device) + self.eval() + if self.half_inference: + self.half() + + def forward(self, inputs): + out = self.body(inputs) + + if self.backbone == 'mobilenet0.25' or self.backbone == 'Resnet50': + out = list(out.values()) + # FPN + fpn = self.fpn(out) + + # SSH + feature1 = self.ssh1(fpn[0]) + feature2 = self.ssh2(fpn[1]) + feature3 = self.ssh3(fpn[2]) + features = [feature1, feature2, feature3] + + bbox_regressions = torch.cat([self.BboxHead[i](feature) for i, feature in enumerate(features)], dim=1) + classifications = torch.cat([self.ClassHead[i](feature) for i, feature in enumerate(features)], dim=1) + tmp = [self.LandmarkHead[i](feature) for i, feature in enumerate(features)] + ldm_regressions = (torch.cat(tmp, dim=1)) + + if self.phase == 'train': + output = (bbox_regressions, classifications, ldm_regressions) + else: + output = 
(bbox_regressions, F.softmax(classifications, dim=-1), ldm_regressions) + return output + + def __detect_faces(self, inputs): + # get scale + height, width = inputs.shape[2:] + self.scale = torch.tensor([width, height, width, height], dtype=torch.float32, device=self.device) + tmp = [width, height, width, height, width, height, width, height, width, height] + self.scale1 = torch.tensor(tmp, dtype=torch.float32, device=self.device) + + # forawrd + inputs = inputs.to(self.device) + if self.half_inference: + inputs = inputs.half() + loc, conf, landmarks = self(inputs) + + # get priorbox + priorbox = PriorBox(self.cfg, image_size=inputs.shape[2:]) + priors = priorbox.forward().to(self.device) + + return loc, conf, landmarks, priors + + # single image detection + def transform(self, image, use_origin_size): + # convert to opencv format + if isinstance(image, Image.Image): + image = cv2.cvtColor(np.asarray(image), cv2.COLOR_RGB2BGR) + image = image.astype(np.float32) + + # testing scale + im_size_min = np.min(image.shape[0:2]) + im_size_max = np.max(image.shape[0:2]) + resize = float(self.target_size) / float(im_size_min) + + # prevent bigger axis from being more than max_size + if np.round(resize * im_size_max) > self.max_size: + resize = float(self.max_size) / float(im_size_max) + resize = 1 if use_origin_size else resize + + # resize + if resize != 1: + image = cv2.resize(image, None, None, fx=resize, fy=resize, interpolation=cv2.INTER_LINEAR) + + # convert to torch.tensor format + # image -= (104, 117, 123) + image = image.transpose(2, 0, 1) + image = torch.from_numpy(image).unsqueeze(0) + + return image, resize + + def detect_faces( + self, + image, + conf_threshold=0.8, + nms_threshold=0.4, + use_origin_size=True, + ): + image, self.resize = self.transform(image, use_origin_size) + image = image.to(self.device) + if self.half_inference: + image = image.half() + image = image - self.mean_tensor + + loc, conf, landmarks, priors = self.__detect_faces(image) + + boxes = decode(loc.data.squeeze(0), priors.data, self.cfg['variance']) + boxes = boxes * self.scale / self.resize + boxes = boxes.cpu().numpy() + + scores = conf.squeeze(0).data.cpu().numpy()[:, 1] + + landmarks = decode_landm(landmarks.squeeze(0), priors, self.cfg['variance']) + landmarks = landmarks * self.scale1 / self.resize + landmarks = landmarks.cpu().numpy() + + # ignore low scores + inds = np.where(scores > conf_threshold)[0] + boxes, landmarks, scores = boxes[inds], landmarks[inds], scores[inds] + + # sort + order = scores.argsort()[::-1] + boxes, landmarks, scores = boxes[order], landmarks[order], scores[order] + + # do NMS + bounding_boxes = np.hstack((boxes, scores[:, np.newaxis])).astype(np.float32, copy=False) + keep = py_cpu_nms(bounding_boxes, nms_threshold) + bounding_boxes, landmarks = bounding_boxes[keep, :], landmarks[keep] + # self.t['forward_pass'].toc() + # print(self.t['forward_pass'].average_time) + # import sys + # sys.stdout.flush() + return np.concatenate((bounding_boxes, landmarks), axis=1) + + def __align_multi(self, image, boxes, landmarks, limit=None): + + if len(boxes) < 1: + return [], [] + + if limit: + boxes = boxes[:limit] + landmarks = landmarks[:limit] + + faces = [] + for landmark in landmarks: + facial5points = [[landmark[2 * j], landmark[2 * j + 1]] for j in range(5)] + + warped_face = warp_and_crop_face(np.array(image), facial5points, self.reference, crop_size=(112, 112)) + faces.append(warped_face) + + return np.concatenate((boxes, landmarks), axis=1), faces + + def align_multi(self, img, 
conf_threshold=0.8, limit=None): + + rlt = self.detect_faces(img, conf_threshold=conf_threshold) + boxes, landmarks = rlt[:, 0:5], rlt[:, 5:] + + return self.__align_multi(img, boxes, landmarks, limit) + + # batched detection + def batched_transform(self, frames, use_origin_size): + """ + Arguments: + frames: a list of PIL.Image, or torch.Tensor(shape=[n, h, w, c], + type=np.float32, BGR format). + use_origin_size: whether to use origin size. + """ + from_PIL = True if isinstance(frames[0], Image.Image) else False + + # convert to opencv format + if from_PIL: + frames = [cv2.cvtColor(np.asarray(frame), cv2.COLOR_RGB2BGR) for frame in frames] + frames = np.asarray(frames, dtype=np.float32) + + # testing scale + im_size_min = np.min(frames[0].shape[0:2]) + im_size_max = np.max(frames[0].shape[0:2]) + resize = float(self.target_size) / float(im_size_min) + + # prevent bigger axis from being more than max_size + if np.round(resize * im_size_max) > self.max_size: + resize = float(self.max_size) / float(im_size_max) + resize = 1 if use_origin_size else resize + + # resize + if resize != 1: + if not from_PIL: + frames = F.interpolate(frames, scale_factor=resize) + else: + frames = [ + cv2.resize(frame, None, None, fx=resize, fy=resize, interpolation=cv2.INTER_LINEAR) + for frame in frames + ] + + # convert to torch.tensor format + if not from_PIL: + frames = frames.transpose(1, 2).transpose(1, 3).contiguous() + else: + frames = frames.transpose((0, 3, 1, 2)) + frames = torch.from_numpy(frames) + + return frames, resize + + def batched_detect_faces(self, frames, conf_threshold=0.8, nms_threshold=0.4, use_origin_size=True): + """ + Arguments: + frames: a list of PIL.Image, or np.array(shape=[n, h, w, c], + type=np.uint8, BGR format). + conf_threshold: confidence threshold. + nms_threshold: nms threshold. + use_origin_size: whether to use origin size. + Returns: + final_bounding_boxes: list of np.array ([n_boxes, 5], + type=np.float32). + final_landmarks: list of np.array ([n_boxes, 10], type=np.float32). 
+ """ + # self.t['forward_pass'].tic() + frames, self.resize = self.batched_transform(frames, use_origin_size) + frames = frames.to(self.device) + frames = frames - self.mean_tensor + + b_loc, b_conf, b_landmarks, priors = self.__detect_faces(frames) + + final_bounding_boxes, final_landmarks = [], [] + + # decode + priors = priors.unsqueeze(0) + b_loc = batched_decode(b_loc, priors, self.cfg['variance']) * self.scale / self.resize + b_landmarks = batched_decode_landm(b_landmarks, priors, self.cfg['variance']) * self.scale1 / self.resize + b_conf = b_conf[:, :, 1] + + # index for selection + b_indice = b_conf > conf_threshold + + # concat + b_loc_and_conf = torch.cat((b_loc, b_conf.unsqueeze(-1)), dim=2).float() + + for pred, landm, inds in zip(b_loc_and_conf, b_landmarks, b_indice): + + # ignore low scores + pred, landm = pred[inds, :], landm[inds, :] + if pred.shape[0] == 0: + final_bounding_boxes.append(np.array([], dtype=np.float32)) + final_landmarks.append(np.array([], dtype=np.float32)) + continue + + # sort + # order = score.argsort(descending=True) + # box, landm, score = box[order], landm[order], score[order] + + # to CPU + bounding_boxes, landm = pred.cpu().numpy(), landm.cpu().numpy() + + # NMS + keep = py_cpu_nms(bounding_boxes, nms_threshold) + bounding_boxes, landmarks = bounding_boxes[keep, :], landm[keep] + + # append + final_bounding_boxes.append(bounding_boxes) + final_landmarks.append(landmarks) + # self.t['forward_pass'].toc(average=True) + # self.batch_time += self.t['forward_pass'].diff + # self.total_frame += len(frames) + # print(self.batch_time / self.total_frame) + + return final_bounding_boxes, final_landmarks diff --git a/imaginairy/vendored/facexlib/detection/retinaface_net.py b/imaginairy/vendored/facexlib/detection/retinaface_net.py new file mode 100644 index 0000000..ab6aa82 --- /dev/null +++ b/imaginairy/vendored/facexlib/detection/retinaface_net.py @@ -0,0 +1,196 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + + +def conv_bn(inp, oup, stride=1, leaky=0): + return nn.Sequential( + nn.Conv2d(inp, oup, 3, stride, 1, bias=False), nn.BatchNorm2d(oup), + nn.LeakyReLU(negative_slope=leaky, inplace=True)) + + +def conv_bn_no_relu(inp, oup, stride): + return nn.Sequential( + nn.Conv2d(inp, oup, 3, stride, 1, bias=False), + nn.BatchNorm2d(oup), + ) + + +def conv_bn1X1(inp, oup, stride, leaky=0): + return nn.Sequential( + nn.Conv2d(inp, oup, 1, stride, padding=0, bias=False), nn.BatchNorm2d(oup), + nn.LeakyReLU(negative_slope=leaky, inplace=True)) + + +def conv_dw(inp, oup, stride, leaky=0.1): + return nn.Sequential( + nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False), + nn.BatchNorm2d(inp), + nn.LeakyReLU(negative_slope=leaky, inplace=True), + nn.Conv2d(inp, oup, 1, 1, 0, bias=False), + nn.BatchNorm2d(oup), + nn.LeakyReLU(negative_slope=leaky, inplace=True), + ) + + +class SSH(nn.Module): + + def __init__(self, in_channel, out_channel): + super(SSH, self).__init__() + assert out_channel % 4 == 0 + leaky = 0 + if (out_channel <= 64): + leaky = 0.1 + self.conv3X3 = conv_bn_no_relu(in_channel, out_channel // 2, stride=1) + + self.conv5X5_1 = conv_bn(in_channel, out_channel // 4, stride=1, leaky=leaky) + self.conv5X5_2 = conv_bn_no_relu(out_channel // 4, out_channel // 4, stride=1) + + self.conv7X7_2 = conv_bn(out_channel // 4, out_channel // 4, stride=1, leaky=leaky) + self.conv7x7_3 = conv_bn_no_relu(out_channel // 4, out_channel // 4, stride=1) + + def forward(self, input): + conv3X3 = self.conv3X3(input) + + conv5X5_1 = 
self.conv5X5_1(input) + conv5X5 = self.conv5X5_2(conv5X5_1) + + conv7X7_2 = self.conv7X7_2(conv5X5_1) + conv7X7 = self.conv7x7_3(conv7X7_2) + + out = torch.cat([conv3X3, conv5X5, conv7X7], dim=1) + out = F.relu(out) + return out + + +class FPN(nn.Module): + + def __init__(self, in_channels_list, out_channels): + super(FPN, self).__init__() + leaky = 0 + if (out_channels <= 64): + leaky = 0.1 + self.output1 = conv_bn1X1(in_channels_list[0], out_channels, stride=1, leaky=leaky) + self.output2 = conv_bn1X1(in_channels_list[1], out_channels, stride=1, leaky=leaky) + self.output3 = conv_bn1X1(in_channels_list[2], out_channels, stride=1, leaky=leaky) + + self.merge1 = conv_bn(out_channels, out_channels, leaky=leaky) + self.merge2 = conv_bn(out_channels, out_channels, leaky=leaky) + + def forward(self, input): + # names = list(input.keys()) + # input = list(input.values()) + + output1 = self.output1(input[0]) + output2 = self.output2(input[1]) + output3 = self.output3(input[2]) + + up3 = F.interpolate(output3, size=[output2.size(2), output2.size(3)], mode='nearest') + output2 = output2 + up3 + output2 = self.merge2(output2) + + up2 = F.interpolate(output2, size=[output1.size(2), output1.size(3)], mode='nearest') + output1 = output1 + up2 + output1 = self.merge1(output1) + + out = [output1, output2, output3] + return out + + +class MobileNetV1(nn.Module): + + def __init__(self): + super(MobileNetV1, self).__init__() + self.stage1 = nn.Sequential( + conv_bn(3, 8, 2, leaky=0.1), # 3 + conv_dw(8, 16, 1), # 7 + conv_dw(16, 32, 2), # 11 + conv_dw(32, 32, 1), # 19 + conv_dw(32, 64, 2), # 27 + conv_dw(64, 64, 1), # 43 + ) + self.stage2 = nn.Sequential( + conv_dw(64, 128, 2), # 43 + 16 = 59 + conv_dw(128, 128, 1), # 59 + 32 = 91 + conv_dw(128, 128, 1), # 91 + 32 = 123 + conv_dw(128, 128, 1), # 123 + 32 = 155 + conv_dw(128, 128, 1), # 155 + 32 = 187 + conv_dw(128, 128, 1), # 187 + 32 = 219 + ) + self.stage3 = nn.Sequential( + conv_dw(128, 256, 2), # 219 +3 2 = 241 + conv_dw(256, 256, 1), # 241 + 64 = 301 + ) + self.avg = nn.AdaptiveAvgPool2d((1, 1)) + self.fc = nn.Linear(256, 1000) + + def forward(self, x): + x = self.stage1(x) + x = self.stage2(x) + x = self.stage3(x) + x = self.avg(x) + # x = self.model(x) + x = x.view(-1, 256) + x = self.fc(x) + return x + + +class ClassHead(nn.Module): + + def __init__(self, inchannels=512, num_anchors=3): + super(ClassHead, self).__init__() + self.num_anchors = num_anchors + self.conv1x1 = nn.Conv2d(inchannels, self.num_anchors * 2, kernel_size=(1, 1), stride=1, padding=0) + + def forward(self, x): + out = self.conv1x1(x) + out = out.permute(0, 2, 3, 1).contiguous() + + return out.view(out.shape[0], -1, 2) + + +class BboxHead(nn.Module): + + def __init__(self, inchannels=512, num_anchors=3): + super(BboxHead, self).__init__() + self.conv1x1 = nn.Conv2d(inchannels, num_anchors * 4, kernel_size=(1, 1), stride=1, padding=0) + + def forward(self, x): + out = self.conv1x1(x) + out = out.permute(0, 2, 3, 1).contiguous() + + return out.view(out.shape[0], -1, 4) + + +class LandmarkHead(nn.Module): + + def __init__(self, inchannels=512, num_anchors=3): + super(LandmarkHead, self).__init__() + self.conv1x1 = nn.Conv2d(inchannels, num_anchors * 10, kernel_size=(1, 1), stride=1, padding=0) + + def forward(self, x): + out = self.conv1x1(x) + out = out.permute(0, 2, 3, 1).contiguous() + + return out.view(out.shape[0], -1, 10) + + +def make_class_head(fpn_num=3, inchannels=64, anchor_num=2): + classhead = nn.ModuleList() + for i in range(fpn_num): + 
classhead.append(ClassHead(inchannels, anchor_num)) + return classhead + + +def make_bbox_head(fpn_num=3, inchannels=64, anchor_num=2): + bboxhead = nn.ModuleList() + for i in range(fpn_num): + bboxhead.append(BboxHead(inchannels, anchor_num)) + return bboxhead + + +def make_landmark_head(fpn_num=3, inchannels=64, anchor_num=2): + landmarkhead = nn.ModuleList() + for i in range(fpn_num): + landmarkhead.append(LandmarkHead(inchannels, anchor_num)) + return landmarkhead diff --git a/imaginairy/vendored/facexlib/detection/retinaface_utils.py b/imaginairy/vendored/facexlib/detection/retinaface_utils.py new file mode 100644 index 0000000..8c35775 --- /dev/null +++ b/imaginairy/vendored/facexlib/detection/retinaface_utils.py @@ -0,0 +1,421 @@ +import numpy as np +import torch +import torchvision +from itertools import product as product +from math import ceil + + +class PriorBox(object): + + def __init__(self, cfg, image_size=None, phase='train'): + super(PriorBox, self).__init__() + self.min_sizes = cfg['min_sizes'] + self.steps = cfg['steps'] + self.clip = cfg['clip'] + self.image_size = image_size + self.feature_maps = [[ceil(self.image_size[0] / step), ceil(self.image_size[1] / step)] for step in self.steps] + self.name = 's' + + def forward(self): + anchors = [] + for k, f in enumerate(self.feature_maps): + min_sizes = self.min_sizes[k] + for i, j in product(range(f[0]), range(f[1])): + for min_size in min_sizes: + s_kx = min_size / self.image_size[1] + s_ky = min_size / self.image_size[0] + dense_cx = [x * self.steps[k] / self.image_size[1] for x in [j + 0.5]] + dense_cy = [y * self.steps[k] / self.image_size[0] for y in [i + 0.5]] + for cy, cx in product(dense_cy, dense_cx): + anchors += [cx, cy, s_kx, s_ky] + + # back to torch land + output = torch.Tensor(anchors).view(-1, 4) + if self.clip: + output.clamp_(max=1, min=0) + return output + + +def py_cpu_nms(dets, thresh): + """Pure Python NMS baseline.""" + keep = torchvision.ops.nms( + boxes=torch.Tensor(dets[:, :4]), + scores=torch.Tensor(dets[:, 4]), + iou_threshold=thresh, + ) + + return list(keep) + + +def point_form(boxes): + """ Convert prior_boxes to (xmin, ymin, xmax, ymax) + representation for comparison to point form ground truth data. + Args: + boxes: (tensor) center-size default boxes from priorbox layers. + Return: + boxes: (tensor) Converted xmin, ymin, xmax, ymax form of boxes. + """ + return torch.cat( + ( + boxes[:, :2] - boxes[:, 2:] / 2, # xmin, ymin + boxes[:, :2] + boxes[:, 2:] / 2), + 1) # xmax, ymax + + +def center_size(boxes): + """ Convert prior_boxes to (cx, cy, w, h) + representation for comparison to center-size form ground truth data. + Args: + boxes: (tensor) point_form boxes + Return: + boxes: (tensor) Converted xmin, ymin, xmax, ymax form of boxes. + """ + return torch.cat( + (boxes[:, 2:] + boxes[:, :2]) / 2, # cx, cy + boxes[:, 2:] - boxes[:, :2], + 1) # w, h + + +def intersect(box_a, box_b): + """ We resize both tensors to [A,B,2] without new malloc: + [A,2] -> [A,1,2] -> [A,B,2] + [B,2] -> [1,B,2] -> [A,B,2] + Then we compute the area of intersect between box_a and box_b. + Args: + box_a: (tensor) bounding boxes, Shape: [A,4]. + box_b: (tensor) bounding boxes, Shape: [B,4]. + Return: + (tensor) intersection area, Shape: [A,B]. 
+ """ + A = box_a.size(0) + B = box_b.size(0) + max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2), box_b[:, 2:].unsqueeze(0).expand(A, B, 2)) + min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2), box_b[:, :2].unsqueeze(0).expand(A, B, 2)) + inter = torch.clamp((max_xy - min_xy), min=0) + return inter[:, :, 0] * inter[:, :, 1] + + +def jaccard(box_a, box_b): + """Compute the jaccard overlap of two sets of boxes. The jaccard overlap + is simply the intersection over union of two boxes. Here we operate on + ground truth boxes and default boxes. + E.g.: + A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B) + Args: + box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4] + box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4] + Return: + jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)] + """ + inter = intersect(box_a, box_b) + area_a = ((box_a[:, 2] - box_a[:, 0]) * (box_a[:, 3] - box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B] + area_b = ((box_b[:, 2] - box_b[:, 0]) * (box_b[:, 3] - box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B] + union = area_a + area_b - inter + return inter / union # [A,B] + + +def matrix_iou(a, b): + """ + return iou of a and b, numpy version for data augenmentation + """ + lt = np.maximum(a[:, np.newaxis, :2], b[:, :2]) + rb = np.minimum(a[:, np.newaxis, 2:], b[:, 2:]) + + area_i = np.prod(rb - lt, axis=2) * (lt < rb).all(axis=2) + area_a = np.prod(a[:, 2:] - a[:, :2], axis=1) + area_b = np.prod(b[:, 2:] - b[:, :2], axis=1) + return area_i / (area_a[:, np.newaxis] + area_b - area_i) + + +def matrix_iof(a, b): + """ + return iof of a and b, numpy version for data augenmentation + """ + lt = np.maximum(a[:, np.newaxis, :2], b[:, :2]) + rb = np.minimum(a[:, np.newaxis, 2:], b[:, 2:]) + + area_i = np.prod(rb - lt, axis=2) * (lt < rb).all(axis=2) + area_a = np.prod(a[:, 2:] - a[:, :2], axis=1) + return area_i / np.maximum(area_a[:, np.newaxis], 1) + + +def match(threshold, truths, priors, variances, labels, landms, loc_t, conf_t, landm_t, idx): + """Match each prior box with the ground truth box of the highest jaccard + overlap, encode the bounding boxes, then return the matched indices + corresponding to both confidence and location preds. + Args: + threshold: (float) The overlap threshold used when matching boxes. + truths: (tensor) Ground truth boxes, Shape: [num_obj, 4]. + priors: (tensor) Prior boxes from priorbox layers, Shape: [n_priors,4]. + variances: (tensor) Variances corresponding to each prior coord, + Shape: [num_priors, 4]. + labels: (tensor) All the class labels for the image, Shape: [num_obj]. + landms: (tensor) Ground truth landms, Shape [num_obj, 10]. + loc_t: (tensor) Tensor to be filled w/ encoded location targets. + conf_t: (tensor) Tensor to be filled w/ matched indices for conf preds. + landm_t: (tensor) Tensor to be filled w/ encoded landm targets. + idx: (int) current batch index + Return: + The matched indices corresponding to 1)location 2)confidence + 3)landm preds. 
+ """ + # jaccard index + overlaps = jaccard(truths, point_form(priors)) + # (Bipartite Matching) + # [1,num_objects] best prior for each ground truth + best_prior_overlap, best_prior_idx = overlaps.max(1, keepdim=True) + + # ignore hard gt + valid_gt_idx = best_prior_overlap[:, 0] >= 0.2 + best_prior_idx_filter = best_prior_idx[valid_gt_idx, :] + if best_prior_idx_filter.shape[0] <= 0: + loc_t[idx] = 0 + conf_t[idx] = 0 + return + + # [1,num_priors] best ground truth for each prior + best_truth_overlap, best_truth_idx = overlaps.max(0, keepdim=True) + best_truth_idx.squeeze_(0) + best_truth_overlap.squeeze_(0) + best_prior_idx.squeeze_(1) + best_prior_idx_filter.squeeze_(1) + best_prior_overlap.squeeze_(1) + best_truth_overlap.index_fill_(0, best_prior_idx_filter, 2) # ensure best prior + # TODO refactor: index best_prior_idx with long tensor + # ensure every gt matches with its prior of max overlap + for j in range(best_prior_idx.size(0)): # 判别此anchor是预测哪一个boxes + best_truth_idx[best_prior_idx[j]] = j + matches = truths[best_truth_idx] # Shape: [num_priors,4] 此处为每一个anchor对应的bbox取出来 + conf = labels[best_truth_idx] # Shape: [num_priors] 此处为每一个anchor对应的label取出来 + conf[best_truth_overlap < threshold] = 0 # label as background overlap<0.35的全部作为负样本 + loc = encode(matches, priors, variances) + + matches_landm = landms[best_truth_idx] + landm = encode_landm(matches_landm, priors, variances) + loc_t[idx] = loc # [num_priors,4] encoded offsets to learn + conf_t[idx] = conf # [num_priors] top class label for each prior + landm_t[idx] = landm + + +def encode(matched, priors, variances): + """Encode the variances from the priorbox layers into the ground truth boxes + we have matched (based on jaccard overlap) with the prior boxes. + Args: + matched: (tensor) Coords of ground truth for each prior in point-form + Shape: [num_priors, 4]. + priors: (tensor) Prior boxes in center-offset form + Shape: [num_priors,4]. + variances: (list[float]) Variances of priorboxes + Return: + encoded boxes (tensor), Shape: [num_priors, 4] + """ + + # dist b/t match center and prior's center + g_cxcy = (matched[:, :2] + matched[:, 2:]) / 2 - priors[:, :2] + # encode variance + g_cxcy /= (variances[0] * priors[:, 2:]) + # match wh / prior wh + g_wh = (matched[:, 2:] - matched[:, :2]) / priors[:, 2:] + g_wh = torch.log(g_wh) / variances[1] + # return target for smooth_l1_loss + return torch.cat([g_cxcy, g_wh], 1) # [num_priors,4] + + +def encode_landm(matched, priors, variances): + """Encode the variances from the priorbox layers into the ground truth boxes + we have matched (based on jaccard overlap) with the prior boxes. + Args: + matched: (tensor) Coords of ground truth for each prior in point-form + Shape: [num_priors, 10]. + priors: (tensor) Prior boxes in center-offset form + Shape: [num_priors,4]. 
+ variances: (list[float]) Variances of priorboxes + Return: + encoded landm (tensor), Shape: [num_priors, 10] + """ + + # dist b/t match center and prior's center + matched = torch.reshape(matched, (matched.size(0), 5, 2)) + priors_cx = priors[:, 0].unsqueeze(1).expand(matched.size(0), 5).unsqueeze(2) + priors_cy = priors[:, 1].unsqueeze(1).expand(matched.size(0), 5).unsqueeze(2) + priors_w = priors[:, 2].unsqueeze(1).expand(matched.size(0), 5).unsqueeze(2) + priors_h = priors[:, 3].unsqueeze(1).expand(matched.size(0), 5).unsqueeze(2) + priors = torch.cat([priors_cx, priors_cy, priors_w, priors_h], dim=2) + g_cxcy = matched[:, :, :2] - priors[:, :, :2] + # encode variance + g_cxcy /= (variances[0] * priors[:, :, 2:]) + # g_cxcy /= priors[:, :, 2:] + g_cxcy = g_cxcy.reshape(g_cxcy.size(0), -1) + # return target for smooth_l1_loss + return g_cxcy + + +# Adapted from https://github.com/Hakuyume/chainer-ssd +def decode(loc, priors, variances): + """Decode locations from predictions using priors to undo + the encoding we did for offset regression at train time. + Args: + loc (tensor): location predictions for loc layers, + Shape: [num_priors,4] + priors (tensor): Prior boxes in center-offset form. + Shape: [num_priors,4]. + variances: (list[float]) Variances of priorboxes + Return: + decoded bounding box predictions + """ + + boxes = torch.cat((priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:], + priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1) + boxes[:, :2] -= boxes[:, 2:] / 2 + boxes[:, 2:] += boxes[:, :2] + return boxes + + +def decode_landm(pre, priors, variances): + """Decode landm from predictions using priors to undo + the encoding we did for offset regression at train time. + Args: + pre (tensor): landm predictions for loc layers, + Shape: [num_priors,10] + priors (tensor): Prior boxes in center-offset form. + Shape: [num_priors,4]. + variances: (list[float]) Variances of priorboxes + Return: + decoded landm predictions + """ + tmp = ( + priors[:, :2] + pre[:, :2] * variances[0] * priors[:, 2:], + priors[:, :2] + pre[:, 2:4] * variances[0] * priors[:, 2:], + priors[:, :2] + pre[:, 4:6] * variances[0] * priors[:, 2:], + priors[:, :2] + pre[:, 6:8] * variances[0] * priors[:, 2:], + priors[:, :2] + pre[:, 8:10] * variances[0] * priors[:, 2:], + ) + landms = torch.cat(tmp, dim=1) + return landms + + +def batched_decode(b_loc, priors, variances): + """Decode locations from predictions using priors to undo + the encoding we did for offset regression at train time. + Args: + b_loc (tensor): location predictions for loc layers, + Shape: [num_batches,num_priors,4] + priors (tensor): Prior boxes in center-offset form. + Shape: [1,num_priors,4]. + variances: (list[float]) Variances of priorboxes + Return: + decoded bounding box predictions + """ + boxes = ( + priors[:, :, :2] + b_loc[:, :, :2] * variances[0] * priors[:, :, 2:], + priors[:, :, 2:] * torch.exp(b_loc[:, :, 2:] * variances[1]), + ) + boxes = torch.cat(boxes, dim=2) + + boxes[:, :, :2] -= boxes[:, :, 2:] / 2 + boxes[:, :, 2:] += boxes[:, :, :2] + return boxes + + +def batched_decode_landm(pre, priors, variances): + """Decode landm from predictions using priors to undo + the encoding we did for offset regression at train time. + Args: + pre (tensor): landm predictions for loc layers, + Shape: [num_batches,num_priors,10] + priors (tensor): Prior boxes in center-offset form. + Shape: [1,num_priors,4]. 
+ variances: (list[float]) Variances of priorboxes + Return: + decoded landm predictions + """ + landms = ( + priors[:, :, :2] + pre[:, :, :2] * variances[0] * priors[:, :, 2:], + priors[:, :, :2] + pre[:, :, 2:4] * variances[0] * priors[:, :, 2:], + priors[:, :, :2] + pre[:, :, 4:6] * variances[0] * priors[:, :, 2:], + priors[:, :, :2] + pre[:, :, 6:8] * variances[0] * priors[:, :, 2:], + priors[:, :, :2] + pre[:, :, 8:10] * variances[0] * priors[:, :, 2:], + ) + landms = torch.cat(landms, dim=2) + return landms + + +def log_sum_exp(x): + """Utility function for computing log_sum_exp while determining + This will be used to determine unaveraged confidence loss across + all examples in a batch. + Args: + x (Variable(tensor)): conf_preds from conf layers + """ + x_max = x.data.max() + return torch.log(torch.sum(torch.exp(x - x_max), 1, keepdim=True)) + x_max + + +# Original author: Francisco Massa: +# https://github.com/fmassa/object-detection.torch +# Ported to PyTorch by Max deGroot (02/01/2017) +def nms(boxes, scores, overlap=0.5, top_k=200): + """Apply non-maximum suppression at test time to avoid detecting too many + overlapping bounding boxes for a given object. + Args: + boxes: (tensor) The location preds for the img, Shape: [num_priors,4]. + scores: (tensor) The class predscores for the img, Shape:[num_priors]. + overlap: (float) The overlap thresh for suppressing unnecessary boxes. + top_k: (int) The Maximum number of box preds to consider. + Return: + The indices of the kept boxes with respect to num_priors. + """ + + keep = torch.Tensor(scores.size(0)).fill_(0).long() + if boxes.numel() == 0: + return keep + x1 = boxes[:, 0] + y1 = boxes[:, 1] + x2 = boxes[:, 2] + y2 = boxes[:, 3] + area = torch.mul(x2 - x1, y2 - y1) + v, idx = scores.sort(0) # sort in ascending order + # I = I[v >= 0.01] + idx = idx[-top_k:] # indices of the top-k largest vals + xx1 = boxes.new() + yy1 = boxes.new() + xx2 = boxes.new() + yy2 = boxes.new() + w = boxes.new() + h = boxes.new() + + # keep = torch.Tensor() + count = 0 + while idx.numel() > 0: + i = idx[-1] # index of current largest val + # keep.append(i) + keep[count] = i + count += 1 + if idx.size(0) == 1: + break + idx = idx[:-1] # remove kept element from view + # load bboxes of next highest vals + torch.index_select(x1, 0, idx, out=xx1) + torch.index_select(y1, 0, idx, out=yy1) + torch.index_select(x2, 0, idx, out=xx2) + torch.index_select(y2, 0, idx, out=yy2) + # store element-wise max with next highest score + xx1 = torch.clamp(xx1, min=x1[i]) + yy1 = torch.clamp(yy1, min=y1[i]) + xx2 = torch.clamp(xx2, max=x2[i]) + yy2 = torch.clamp(yy2, max=y2[i]) + w.resize_as_(xx2) + h.resize_as_(yy2) + w = xx2 - xx1 + h = yy2 - yy1 + # check sizes of xx1 and xx2.. 
after each iteration + w = torch.clamp(w, min=0.0) + h = torch.clamp(h, min=0.0) + inter = w * h + # IoU = i / (area(a) + area(b) - i) + rem_areas = torch.index_select(area, 0, idx) # load remaining areas) + union = (rem_areas - inter) + area[i] + IoU = inter / union # store result in iou + # keep only elements with an IoU <= overlap + idx = idx[IoU.le(overlap)] + return keep, count diff --git a/imaginairy/vendored/facexlib/headpose/__init__.py b/imaginairy/vendored/facexlib/headpose/__init__.py new file mode 100644 index 0000000..bbda8a8 --- /dev/null +++ b/imaginairy/vendored/facexlib/headpose/__init__.py @@ -0,0 +1,20 @@ +import torch + +from imaginairy.vendored.facexlib.utils import load_file_from_url +from .hopenet_arch import HopeNet + + +def init_headpose_model(model_name, half=False, device='cuda', model_rootpath=None): + if model_name == 'hopenet': + model = HopeNet('resnet', [3, 4, 6, 3], 66) + model_url = 'https://github.com/xinntao/facexlib/releases/download/v0.2.0/headpose_hopenet.pth' + else: + raise NotImplementedError(f'{model_name} is not implemented.') + + model_path = load_file_from_url( + url=model_url, model_dir='facexlib/weights', progress=True, file_name=None, save_dir=model_rootpath) + load_net = torch.load(model_path, map_location=lambda storage, loc: storage)['params'] + model.load_state_dict(load_net, strict=True) + model.eval() + model = model.to(device) + return model diff --git a/imaginairy/vendored/facexlib/headpose/hopenet_arch.py b/imaginairy/vendored/facexlib/headpose/hopenet_arch.py new file mode 100644 index 0000000..b3a0141 --- /dev/null +++ b/imaginairy/vendored/facexlib/headpose/hopenet_arch.py @@ -0,0 +1,72 @@ +import torch +import torch.nn as nn +import torchvision + + +class HopeNet(nn.Module): + # Hopenet with 3 output layers for yaw, pitch and roll + # Predicts Euler angles by binning and regression with the expected value + def __init__(self, block, layers, num_bins): + super(HopeNet, self).__init__() + if block == 'resnet': + block = torchvision.models.resnet.Bottleneck + self.inplanes = 64 + self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False) + self.bn1 = nn.BatchNorm2d(64) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2) + self.layer4 = self._make_layer(block, 512, layers[3], stride=2) + self.avgpool = nn.AvgPool2d(7) + self.fc_yaw = nn.Linear(512 * block.expansion, num_bins) + self.fc_pitch = nn.Linear(512 * block.expansion, num_bins) + self.fc_roll = nn.Linear(512 * block.expansion, num_bins) + + self.idx_tensor = torch.arange(66).float() + + def _make_layer(self, block, planes, blocks, stride=1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d(self.inplanes, planes * block.expansion, kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(planes * block.expansion), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes)) + return nn.Sequential(*layers) + + @staticmethod + def softmax_temperature(tensor, temperature): + result = torch.exp(tensor / temperature) + result = torch.div(result, torch.sum(result, 1).unsqueeze(1).expand_as(result)) + return result 
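As a point of reference for the head-pose module vendored above, a minimal usage sketch. The 224x224 input size is an assumption inferred from the AvgPool2d(7) that follows four stride-2 stages, and the random tensor is a hypothetical stand-in for a properly cropped and normalized face.

import torch
from imaginairy.vendored.facexlib.headpose import init_headpose_model

# Fetches the hopenet weights via load_file_from_url on first use.
model = init_headpose_model('hopenet', device='cpu')
face = torch.rand(1, 3, 224, 224)  # hypothetical pre-cropped, normalized face batch
with torch.no_grad():
    yaw, pitch, roll = model(face)  # predicted Euler angles in degrees, one per image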
+ + def bin2degree(self, predict): + predict = self.softmax_temperature(predict, 1) + return torch.sum(predict * self.idx_tensor.type_as(predict), 1) * 3 - 99 + + def forward(self, x): + x = self.relu(self.bn1(self.conv1(x))) + x = self.maxpool(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + + x = self.avgpool(x) + x = x.view(x.size(0), -1) + pre_yaw = self.fc_yaw(x) + pre_pitch = self.fc_pitch(x) + pre_roll = self.fc_roll(x) + + yaw = self.bin2degree(pre_yaw) + pitch = self.bin2degree(pre_pitch) + roll = self.bin2degree(pre_roll) + return yaw, pitch, roll diff --git a/imaginairy/vendored/facexlib/matting/__init__.py b/imaginairy/vendored/facexlib/matting/__init__.py new file mode 100644 index 0000000..b07381c --- /dev/null +++ b/imaginairy/vendored/facexlib/matting/__init__.py @@ -0,0 +1,27 @@ +import torch +from copy import deepcopy + +from imaginairy.vendored.facexlib.utils import load_file_from_url +from .modnet import MODNet + + +def init_matting_model(model_name='modnet', half=False, device='cuda', model_rootpath=None): + if model_name == 'modnet': + model = MODNet(backbone_pretrained=False) + model_url = 'https://github.com/xinntao/facexlib/releases/download/v0.2.0/matting_modnet_portrait.pth' + else: + raise NotImplementedError(f'{model_name} is not implemented.') + + model_path = load_file_from_url( + url=model_url, model_dir='facexlib/weights', progress=True, file_name=None, save_dir=model_rootpath) + # TODO: clean pretrained model + load_net = torch.load(model_path, map_location=lambda storage, loc: storage) + # remove unnecessary 'module.' + for k, v in deepcopy(load_net).items(): + if k.startswith('module.'): + load_net[k[7:]] = v + load_net.pop(k) + model.load_state_dict(load_net, strict=True) + model.eval() + model = model.to(device) + return model diff --git a/imaginairy/vendored/facexlib/matting/backbone.py b/imaginairy/vendored/facexlib/matting/backbone.py new file mode 100644 index 0000000..4cb295f --- /dev/null +++ b/imaginairy/vendored/facexlib/matting/backbone.py @@ -0,0 +1,80 @@ +import os +import torch +import torch.nn as nn + +from .mobilenetv2 import MobileNetV2 + + +class BaseBackbone(nn.Module): + """ Superclass of Replaceable Backbone Model for Semantic Estimation + """ + + def __init__(self, in_channels): + super(BaseBackbone, self).__init__() + self.in_channels = in_channels + + self.model = None + self.enc_channels = [] + + def forward(self, x): + raise NotImplementedError + + def load_pretrained_ckpt(self): + raise NotImplementedError + + +class MobileNetV2Backbone(BaseBackbone): + """ MobileNetV2 Backbone + """ + + def __init__(self, in_channels): + super(MobileNetV2Backbone, self).__init__(in_channels) + + self.model = MobileNetV2(self.in_channels, alpha=1.0, expansion=6, num_classes=None) + self.enc_channels = [16, 24, 32, 96, 1280] + + def forward(self, x): + # x = reduce(lambda x, n: self.model.features[n](x), list(range(0, 2)), x) + x = self.model.features[0](x) + x = self.model.features[1](x) + enc2x = x + + # x = reduce(lambda x, n: self.model.features[n](x), list(range(2, 4)), x) + x = self.model.features[2](x) + x = self.model.features[3](x) + enc4x = x + + # x = reduce(lambda x, n: self.model.features[n](x), list(range(4, 7)), x) + x = self.model.features[4](x) + x = self.model.features[5](x) + x = self.model.features[6](x) + enc8x = x + + # x = reduce(lambda x, n: self.model.features[n](x), list(range(7, 14)), x) + x = self.model.features[7](x) + x = self.model.features[8](x) + x = 
self.model.features[9](x) + x = self.model.features[10](x) + x = self.model.features[11](x) + x = self.model.features[12](x) + x = self.model.features[13](x) + enc16x = x + + # x = reduce(lambda x, n: self.model.features[n](x), list(range(14, 19)), x) + x = self.model.features[14](x) + x = self.model.features[15](x) + x = self.model.features[16](x) + x = self.model.features[17](x) + x = self.model.features[18](x) + enc32x = x + return [enc2x, enc4x, enc8x, enc16x, enc32x] + + def load_pretrained_ckpt(self): + # the pre-trained model is provided by https://github.com/thuyngch/Human-Segmentation-PyTorch + ckpt_path = './pretrained/mobilenetv2_human_seg.ckpt' + if not os.path.exists(ckpt_path): + print('cannot find the pretrained mobilenetv2 backbone') + exit() + + ckpt = torch.load(ckpt_path) + self.model.load_state_dict(ckpt) diff --git a/imaginairy/vendored/facexlib/matting/mobilenetv2.py b/imaginairy/vendored/facexlib/matting/mobilenetv2.py new file mode 100644 index 0000000..c649586 --- /dev/null +++ b/imaginairy/vendored/facexlib/matting/mobilenetv2.py @@ -0,0 +1,192 @@ +""" This file is adapted from https://github.com/thuyngch/Human-Segmentation-PyTorch""" + +import math +import torch +from torch import nn + +# ------------------------------------------------------------------------------ +# Useful functions +# ------------------------------------------------------------------------------ + + +def _make_divisible(v, divisor, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than 10%. + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +def conv_bn(inp, oup, stride): + return nn.Sequential(nn.Conv2d(inp, oup, 3, stride, 1, bias=False), nn.BatchNorm2d(oup), nn.ReLU6(inplace=True)) + + +def conv_1x1_bn(inp, oup): + return nn.Sequential(nn.Conv2d(inp, oup, 1, 1, 0, bias=False), nn.BatchNorm2d(oup), nn.ReLU6(inplace=True)) + + +# ------------------------------------------------------------------------------ +# Class of Inverted Residual block +# ------------------------------------------------------------------------------ + + +class InvertedResidual(nn.Module): + + def __init__(self, inp, oup, stride, expansion, dilation=1): + super(InvertedResidual, self).__init__() + self.stride = stride + assert stride in [1, 2] + + hidden_dim = round(inp * expansion) + self.use_res_connect = self.stride == 1 and inp == oup + + if expansion == 1: + self.conv = nn.Sequential( + # dw + nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, dilation=dilation, bias=False), + nn.BatchNorm2d(hidden_dim), + nn.ReLU6(inplace=True), + # pw-linear + nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False), + nn.BatchNorm2d(oup), + ) + else: + self.conv = nn.Sequential( + # pw + nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False), + nn.BatchNorm2d(hidden_dim), + nn.ReLU6(inplace=True), + # dw + nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, dilation=dilation, bias=False), + nn.BatchNorm2d(hidden_dim), + nn.ReLU6(inplace=True), + # pw-linear + nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False), + nn.BatchNorm2d(oup), + ) + + def forward(self, x): + if self.use_res_connect: + return x + self.conv(x) + else: + return self.conv(x) + + +# ------------------------------------------------------------------------------ +# Class of MobileNetV2 +# ------------------------------------------------------------------------------ + + +class MobileNetV2(nn.Module): + + def 
__init__(self, in_channels, alpha=1.0, expansion=6, num_classes=1000): + super(MobileNetV2, self).__init__() + self.in_channels = in_channels + self.num_classes = num_classes + input_channel = 32 + last_channel = 1280 + interverted_residual_setting = [ + # t, c, n, s + [1, 16, 1, 1], + [expansion, 24, 2, 2], + [expansion, 32, 3, 2], + [expansion, 64, 4, 2], + [expansion, 96, 3, 1], + [expansion, 160, 3, 2], + [expansion, 320, 1, 1], + ] + + # building first layer + input_channel = _make_divisible(input_channel * alpha, 8) + self.last_channel = _make_divisible(last_channel * alpha, 8) if alpha > 1.0 else last_channel + self.features = [conv_bn(self.in_channels, input_channel, 2)] + + # building inverted residual blocks + for t, c, n, s in interverted_residual_setting: + output_channel = _make_divisible(int(c * alpha), 8) + for i in range(n): + if i == 0: + self.features.append(InvertedResidual(input_channel, output_channel, s, expansion=t)) + else: + self.features.append(InvertedResidual(input_channel, output_channel, 1, expansion=t)) + input_channel = output_channel + + # building last several layers + self.features.append(conv_1x1_bn(input_channel, self.last_channel)) + + # make it nn.Sequential + self.features = nn.Sequential(*self.features) + + # building classifier + if self.num_classes is not None: + self.classifier = nn.Sequential( + nn.Dropout(0.2), + nn.Linear(self.last_channel, num_classes), + ) + + # Initialize weights + self._init_weights() + + def forward(self, x): + # Stage1 + x = self.features[0](x) + x = self.features[1](x) + # Stage2 + x = self.features[2](x) + x = self.features[3](x) + # Stage3 + x = self.features[4](x) + x = self.features[5](x) + x = self.features[6](x) + # Stage4 + x = self.features[7](x) + x = self.features[8](x) + x = self.features[9](x) + x = self.features[10](x) + x = self.features[11](x) + x = self.features[12](x) + x = self.features[13](x) + # Stage5 + x = self.features[14](x) + x = self.features[15](x) + x = self.features[16](x) + x = self.features[17](x) + x = self.features[18](x) + + # Classification + if self.num_classes is not None: + x = x.mean(dim=(2, 3)) + x = self.classifier(x) + + # Output + return x + + def _load_pretrained_model(self, pretrained_file): + pretrain_dict = torch.load(pretrained_file, map_location='cpu') + model_dict = {} + state_dict = self.state_dict() + print('[MobileNetV2] Loading pretrained model...') + for k, v in pretrain_dict.items(): + if k in state_dict: + model_dict[k] = v + else: + print(k, 'is ignored') + state_dict.update(model_dict) + self.load_state_dict(state_dict) + + def _init_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(0, math.sqrt(2. 
/ n)) + if m.bias is not None: + m.bias.data.zero_() + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + elif isinstance(m, nn.Linear): + n = m.weight.size(1) + m.weight.data.normal_(0, 0.01) + m.bias.data.zero_() diff --git a/imaginairy/vendored/facexlib/matting/modnet.py b/imaginairy/vendored/facexlib/matting/modnet.py new file mode 100644 index 0000000..cd23c38 --- /dev/null +++ b/imaginairy/vendored/facexlib/matting/modnet.py @@ -0,0 +1,267 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .backbone import MobileNetV2Backbone + +# ------------------------------------------------------------------------------ +# MODNet Basic Modules +# ------------------------------------------------------------------------------ + + +class IBNorm(nn.Module): + """ Combine Instance Norm and Batch Norm into One Layer + """ + + def __init__(self, in_channels): + super(IBNorm, self).__init__() + in_channels = in_channels + self.bnorm_channels = int(in_channels / 2) + self.inorm_channels = in_channels - self.bnorm_channels + + self.bnorm = nn.BatchNorm2d(self.bnorm_channels, affine=True) + self.inorm = nn.InstanceNorm2d(self.inorm_channels, affine=False) + + def forward(self, x): + bn_x = self.bnorm(x[:, :self.bnorm_channels, ...].contiguous()) + in_x = self.inorm(x[:, self.bnorm_channels:, ...].contiguous()) + + return torch.cat((bn_x, in_x), 1) + + +class Conv2dIBNormRelu(nn.Module): + """ Convolution + IBNorm + ReLu + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + with_ibn=True, + with_relu=True): + super(Conv2dIBNormRelu, self).__init__() + + layers = [ + nn.Conv2d( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=bias) + ] + + if with_ibn: + layers.append(IBNorm(out_channels)) + if with_relu: + layers.append(nn.ReLU(inplace=True)) + + self.layers = nn.Sequential(*layers) + + def forward(self, x): + return self.layers(x) + + +class SEBlock(nn.Module): + """ SE Block Proposed in https://arxiv.org/pdf/1709.01507.pdf + """ + + def __init__(self, in_channels, out_channels, reduction=1): + super(SEBlock, self).__init__() + self.pool = nn.AdaptiveAvgPool2d(1) + self.fc = nn.Sequential( + nn.Linear(in_channels, int(in_channels // reduction), bias=False), nn.ReLU(inplace=True), + nn.Linear(int(in_channels // reduction), out_channels, bias=False), nn.Sigmoid()) + + def forward(self, x): + b, c, _, _ = x.size() + w = self.pool(x).view(b, c) + w = self.fc(w).view(b, c, 1, 1) + + return x * w.expand_as(x) + + +# ------------------------------------------------------------------------------ +# MODNet Branches +# ------------------------------------------------------------------------------ + + +class LRBranch(nn.Module): + """ Low Resolution Branch of MODNet + """ + + def __init__(self, backbone): + super(LRBranch, self).__init__() + + enc_channels = backbone.enc_channels + + self.backbone = backbone + self.se_block = SEBlock(enc_channels[4], enc_channels[4], reduction=4) + self.conv_lr16x = Conv2dIBNormRelu(enc_channels[4], enc_channels[3], 5, stride=1, padding=2) + self.conv_lr8x = Conv2dIBNormRelu(enc_channels[3], enc_channels[2], 5, stride=1, padding=2) + self.conv_lr = Conv2dIBNormRelu( + enc_channels[2], 1, kernel_size=3, stride=2, padding=1, with_ibn=False, with_relu=False) + + def forward(self, img, inference): + enc_features = self.backbone.forward(img) + enc2x, enc4x, enc32x 
= enc_features[0], enc_features[1], enc_features[4] + + enc32x = self.se_block(enc32x) + lr16x = F.interpolate(enc32x, scale_factor=2, mode='bilinear', align_corners=False) + lr16x = self.conv_lr16x(lr16x) + lr8x = F.interpolate(lr16x, scale_factor=2, mode='bilinear', align_corners=False) + lr8x = self.conv_lr8x(lr8x) + + pred_semantic = None + if not inference: + lr = self.conv_lr(lr8x) + pred_semantic = torch.sigmoid(lr) + + return pred_semantic, lr8x, [enc2x, enc4x] + + +class HRBranch(nn.Module): + """ High Resolution Branch of MODNet + """ + + def __init__(self, hr_channels, enc_channels): + super(HRBranch, self).__init__() + + self.tohr_enc2x = Conv2dIBNormRelu(enc_channels[0], hr_channels, 1, stride=1, padding=0) + self.conv_enc2x = Conv2dIBNormRelu(hr_channels + 3, hr_channels, 3, stride=2, padding=1) + + self.tohr_enc4x = Conv2dIBNormRelu(enc_channels[1], hr_channels, 1, stride=1, padding=0) + self.conv_enc4x = Conv2dIBNormRelu(2 * hr_channels, 2 * hr_channels, 3, stride=1, padding=1) + + self.conv_hr4x = nn.Sequential( + Conv2dIBNormRelu(3 * hr_channels + 3, 2 * hr_channels, 3, stride=1, padding=1), + Conv2dIBNormRelu(2 * hr_channels, 2 * hr_channels, 3, stride=1, padding=1), + Conv2dIBNormRelu(2 * hr_channels, hr_channels, 3, stride=1, padding=1), + ) + + self.conv_hr2x = nn.Sequential( + Conv2dIBNormRelu(2 * hr_channels, 2 * hr_channels, 3, stride=1, padding=1), + Conv2dIBNormRelu(2 * hr_channels, hr_channels, 3, stride=1, padding=1), + Conv2dIBNormRelu(hr_channels, hr_channels, 3, stride=1, padding=1), + Conv2dIBNormRelu(hr_channels, hr_channels, 3, stride=1, padding=1), + ) + + self.conv_hr = nn.Sequential( + Conv2dIBNormRelu(hr_channels + 3, hr_channels, 3, stride=1, padding=1), + Conv2dIBNormRelu(hr_channels, 1, kernel_size=1, stride=1, padding=0, with_ibn=False, with_relu=False), + ) + + def forward(self, img, enc2x, enc4x, lr8x, inference): + img2x = F.interpolate(img, scale_factor=1 / 2, mode='bilinear', align_corners=False) + img4x = F.interpolate(img, scale_factor=1 / 4, mode='bilinear', align_corners=False) + + enc2x = self.tohr_enc2x(enc2x) + hr4x = self.conv_enc2x(torch.cat((img2x, enc2x), dim=1)) + + enc4x = self.tohr_enc4x(enc4x) + hr4x = self.conv_enc4x(torch.cat((hr4x, enc4x), dim=1)) + + lr4x = F.interpolate(lr8x, scale_factor=2, mode='bilinear', align_corners=False) + hr4x = self.conv_hr4x(torch.cat((hr4x, lr4x, img4x), dim=1)) + + hr2x = F.interpolate(hr4x, scale_factor=2, mode='bilinear', align_corners=False) + hr2x = self.conv_hr2x(torch.cat((hr2x, enc2x), dim=1)) + + pred_detail = None + if not inference: + hr = F.interpolate(hr2x, scale_factor=2, mode='bilinear', align_corners=False) + hr = self.conv_hr(torch.cat((hr, img), dim=1)) + pred_detail = torch.sigmoid(hr) + + return pred_detail, hr2x + + +class FusionBranch(nn.Module): + """ Fusion Branch of MODNet + """ + + def __init__(self, hr_channels, enc_channels): + super(FusionBranch, self).__init__() + self.conv_lr4x = Conv2dIBNormRelu(enc_channels[2], hr_channels, 5, stride=1, padding=2) + + self.conv_f2x = Conv2dIBNormRelu(2 * hr_channels, hr_channels, 3, stride=1, padding=1) + self.conv_f = nn.Sequential( + Conv2dIBNormRelu(hr_channels + 3, int(hr_channels / 2), 3, stride=1, padding=1), + Conv2dIBNormRelu(int(hr_channels / 2), 1, 1, stride=1, padding=0, with_ibn=False, with_relu=False), + ) + + def forward(self, img, lr8x, hr2x): + lr4x = F.interpolate(lr8x, scale_factor=2, mode='bilinear', align_corners=False) + lr4x = self.conv_lr4x(lr4x) + lr2x = F.interpolate(lr4x, scale_factor=2, 
mode='bilinear', align_corners=False) + + f2x = self.conv_f2x(torch.cat((lr2x, hr2x), dim=1)) + f = F.interpolate(f2x, scale_factor=2, mode='bilinear', align_corners=False) + f = self.conv_f(torch.cat((f, img), dim=1)) + pred_matte = torch.sigmoid(f) + + return pred_matte + + +# ------------------------------------------------------------------------------ +# MODNet +# ------------------------------------------------------------------------------ + + +class MODNet(nn.Module): + """ Architecture of MODNet + """ + + def __init__(self, in_channels=3, hr_channels=32, backbone_pretrained=True): + super(MODNet, self).__init__() + + self.in_channels = in_channels + self.hr_channels = hr_channels + self.backbone_pretrained = backbone_pretrained + + self.backbone = MobileNetV2Backbone(self.in_channels) + + self.lr_branch = LRBranch(self.backbone) + self.hr_branch = HRBranch(self.hr_channels, self.backbone.enc_channels) + self.f_branch = FusionBranch(self.hr_channels, self.backbone.enc_channels) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + self._init_conv(m) + elif isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.InstanceNorm2d): + self._init_norm(m) + + if self.backbone_pretrained: + self.backbone.load_pretrained_ckpt() + + def forward(self, img, inference): + pred_semantic, lr8x, [enc2x, enc4x] = self.lr_branch(img, inference) + pred_detail, hr2x = self.hr_branch(img, enc2x, enc4x, lr8x, inference) + pred_matte = self.f_branch(img, lr8x, hr2x) + + return pred_semantic, pred_detail, pred_matte + + def freeze_norm(self): + norm_types = [nn.BatchNorm2d, nn.InstanceNorm2d] + for m in self.modules(): + for n in norm_types: + if isinstance(m, n): + m.eval() + continue + + def _init_conv(self, conv): + nn.init.kaiming_uniform_(conv.weight, a=0, mode='fan_in', nonlinearity='relu') + if conv.bias is not None: + nn.init.constant_(conv.bias, 0) + + def _init_norm(self, norm): + if norm.weight is not None: + nn.init.constant_(norm.weight, 1) + nn.init.constant_(norm.bias, 0) diff --git a/imaginairy/vendored/facexlib/parsing/__init__.py b/imaginairy/vendored/facexlib/parsing/__init__.py new file mode 100644 index 0000000..9157eae --- /dev/null +++ b/imaginairy/vendored/facexlib/parsing/__init__.py @@ -0,0 +1,24 @@ +import torch + +from imaginairy.vendored.facexlib.utils import load_file_from_url +from .bisenet import BiSeNet +from .parsenet import ParseNet + + +def init_parsing_model(model_name='bisenet', half=False, device='cuda', model_rootpath=None): + if model_name == 'bisenet': + model = BiSeNet(num_class=19) + model_url = 'https://github.com/xinntao/facexlib/releases/download/v0.2.0/parsing_bisenet.pth' + elif model_name == 'parsenet': + model = ParseNet(in_size=512, out_size=512, parsing_ch=19) + model_url = 'https://github.com/xinntao/facexlib/releases/download/v0.2.2/parsing_parsenet.pth' + else: + raise NotImplementedError(f'{model_name} is not implemented.') + + model_path = load_file_from_url( + url=model_url, model_dir='facexlib/weights', progress=True, file_name=None, save_dir=model_rootpath) + load_net = torch.load(model_path, map_location=lambda storage, loc: storage) + model.load_state_dict(load_net, strict=True) + model.eval() + model = model.to(device) + return model diff --git a/imaginairy/vendored/facexlib/parsing/bisenet.py b/imaginairy/vendored/facexlib/parsing/bisenet.py new file mode 100644 index 0000000..3898cab --- /dev/null +++ b/imaginairy/vendored/facexlib/parsing/bisenet.py @@ -0,0 +1,140 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + 
+from .resnet import ResNet18 + + +class ConvBNReLU(nn.Module): + + def __init__(self, in_chan, out_chan, ks=3, stride=1, padding=1): + super(ConvBNReLU, self).__init__() + self.conv = nn.Conv2d(in_chan, out_chan, kernel_size=ks, stride=stride, padding=padding, bias=False) + self.bn = nn.BatchNorm2d(out_chan) + + def forward(self, x): + x = self.conv(x) + x = F.relu(self.bn(x)) + return x + + +class BiSeNetOutput(nn.Module): + + def __init__(self, in_chan, mid_chan, num_class): + super(BiSeNetOutput, self).__init__() + self.conv = ConvBNReLU(in_chan, mid_chan, ks=3, stride=1, padding=1) + self.conv_out = nn.Conv2d(mid_chan, num_class, kernel_size=1, bias=False) + + def forward(self, x): + feat = self.conv(x) + out = self.conv_out(feat) + return out, feat + + +class AttentionRefinementModule(nn.Module): + + def __init__(self, in_chan, out_chan): + super(AttentionRefinementModule, self).__init__() + self.conv = ConvBNReLU(in_chan, out_chan, ks=3, stride=1, padding=1) + self.conv_atten = nn.Conv2d(out_chan, out_chan, kernel_size=1, bias=False) + self.bn_atten = nn.BatchNorm2d(out_chan) + self.sigmoid_atten = nn.Sigmoid() + + def forward(self, x): + feat = self.conv(x) + atten = F.avg_pool2d(feat, feat.size()[2:]) + atten = self.conv_atten(atten) + atten = self.bn_atten(atten) + atten = self.sigmoid_atten(atten) + out = torch.mul(feat, atten) + return out + + +class ContextPath(nn.Module): + + def __init__(self): + super(ContextPath, self).__init__() + self.resnet = ResNet18() + self.arm16 = AttentionRefinementModule(256, 128) + self.arm32 = AttentionRefinementModule(512, 128) + self.conv_head32 = ConvBNReLU(128, 128, ks=3, stride=1, padding=1) + self.conv_head16 = ConvBNReLU(128, 128, ks=3, stride=1, padding=1) + self.conv_avg = ConvBNReLU(512, 128, ks=1, stride=1, padding=0) + + def forward(self, x): + feat8, feat16, feat32 = self.resnet(x) + h8, w8 = feat8.size()[2:] + h16, w16 = feat16.size()[2:] + h32, w32 = feat32.size()[2:] + + avg = F.avg_pool2d(feat32, feat32.size()[2:]) + avg = self.conv_avg(avg) + avg_up = F.interpolate(avg, (h32, w32), mode='nearest') + + feat32_arm = self.arm32(feat32) + feat32_sum = feat32_arm + avg_up + feat32_up = F.interpolate(feat32_sum, (h16, w16), mode='nearest') + feat32_up = self.conv_head32(feat32_up) + + feat16_arm = self.arm16(feat16) + feat16_sum = feat16_arm + feat32_up + feat16_up = F.interpolate(feat16_sum, (h8, w8), mode='nearest') + feat16_up = self.conv_head16(feat16_up) + + return feat8, feat16_up, feat32_up # x8, x8, x16 + + +class FeatureFusionModule(nn.Module): + + def __init__(self, in_chan, out_chan): + super(FeatureFusionModule, self).__init__() + self.convblk = ConvBNReLU(in_chan, out_chan, ks=1, stride=1, padding=0) + self.conv1 = nn.Conv2d(out_chan, out_chan // 4, kernel_size=1, stride=1, padding=0, bias=False) + self.conv2 = nn.Conv2d(out_chan // 4, out_chan, kernel_size=1, stride=1, padding=0, bias=False) + self.relu = nn.ReLU(inplace=True) + self.sigmoid = nn.Sigmoid() + + def forward(self, fsp, fcp): + fcat = torch.cat([fsp, fcp], dim=1) + feat = self.convblk(fcat) + atten = F.avg_pool2d(feat, feat.size()[2:]) + atten = self.conv1(atten) + atten = self.relu(atten) + atten = self.conv2(atten) + atten = self.sigmoid(atten) + feat_atten = torch.mul(feat, atten) + feat_out = feat_atten + feat + return feat_out + + +class BiSeNet(nn.Module): + + def __init__(self, num_class): + super(BiSeNet, self).__init__() + self.cp = ContextPath() + self.ffm = FeatureFusionModule(256, 256) + self.conv_out = BiSeNetOutput(256, 256, num_class) + 
self.conv_out16 = BiSeNetOutput(128, 64, num_class) + self.conv_out32 = BiSeNetOutput(128, 64, num_class) + + def forward(self, x, return_feat=False): + h, w = x.size()[2:] + feat_res8, feat_cp8, feat_cp16 = self.cp(x) # return res3b1 feature + feat_sp = feat_res8 # replace spatial path feature with res3b1 feature + feat_fuse = self.ffm(feat_sp, feat_cp8) + + out, feat = self.conv_out(feat_fuse) + out16, feat16 = self.conv_out16(feat_cp8) + out32, feat32 = self.conv_out32(feat_cp16) + + out = F.interpolate(out, (h, w), mode='bilinear', align_corners=True) + out16 = F.interpolate(out16, (h, w), mode='bilinear', align_corners=True) + out32 = F.interpolate(out32, (h, w), mode='bilinear', align_corners=True) + + if return_feat: + feat = F.interpolate(feat, (h, w), mode='bilinear', align_corners=True) + feat16 = F.interpolate(feat16, (h, w), mode='bilinear', align_corners=True) + feat32 = F.interpolate(feat32, (h, w), mode='bilinear', align_corners=True) + return out, out16, out32, feat, feat16, feat32 + else: + return out, out16, out32 diff --git a/imaginairy/vendored/facexlib/parsing/parsenet.py b/imaginairy/vendored/facexlib/parsing/parsenet.py new file mode 100644 index 0000000..e178ebe --- /dev/null +++ b/imaginairy/vendored/facexlib/parsing/parsenet.py @@ -0,0 +1,194 @@ +"""Modified from https://github.com/chaofengc/PSFRGAN +""" +import numpy as np +import torch.nn as nn +from torch.nn import functional as F + + +class NormLayer(nn.Module): + """Normalization Layers. + + Args: + channels: input channels, for batch norm and instance norm. + input_size: input shape without batch size, for layer norm. + """ + + def __init__(self, channels, normalize_shape=None, norm_type='bn'): + super(NormLayer, self).__init__() + norm_type = norm_type.lower() + self.norm_type = norm_type + if norm_type == 'bn': + self.norm = nn.BatchNorm2d(channels, affine=True) + elif norm_type == 'in': + self.norm = nn.InstanceNorm2d(channels, affine=False) + elif norm_type == 'gn': + self.norm = nn.GroupNorm(32, channels, affine=True) + elif norm_type == 'pixel': + self.norm = lambda x: F.normalize(x, p=2, dim=1) + elif norm_type == 'layer': + self.norm = nn.LayerNorm(normalize_shape) + elif norm_type == 'none': + self.norm = lambda x: x * 1.0 + else: + assert 1 == 0, f'Norm type {norm_type} not support.' + + def forward(self, x, ref=None): + if self.norm_type == 'spade': + return self.norm(x, ref) + else: + return self.norm(x) + + +class ReluLayer(nn.Module): + """Relu Layer. + + Args: + relu type: type of relu layer, candidates are + - ReLU + - LeakyReLU: default relu slope 0.2 + - PRelu + - SELU + - none: direct pass + """ + + def __init__(self, channels, relu_type='relu'): + super(ReluLayer, self).__init__() + relu_type = relu_type.lower() + if relu_type == 'relu': + self.func = nn.ReLU(True) + elif relu_type == 'leakyrelu': + self.func = nn.LeakyReLU(0.2, inplace=True) + elif relu_type == 'prelu': + self.func = nn.PReLU(channels) + elif relu_type == 'selu': + self.func = nn.SELU(True) + elif relu_type == 'none': + self.func = lambda x: x * 1.0 + else: + assert 1 == 0, f'Relu type {relu_type} not support.' 
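For the parsing module vendored above, a minimal usage sketch of the BiSeNet variant loaded by init_parsing_model. The 512x512 random tensor is a hypothetical stand-in for a normalized face crop.

import torch
from imaginairy.vendored.facexlib.parsing import init_parsing_model

# Fetches the bisenet weights via load_file_from_url on first use.
parser = init_parsing_model('bisenet', device='cpu')
face = torch.rand(1, 3, 512, 512)  # hypothetical normalized 512x512 face crop
with torch.no_grad():
    out, out16, out32 = parser(face)  # three 19-class logit maps, upsampled to the input size
labels = out.argmax(dim=1)  # per-pixel face-parsing labels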
+ + def forward(self, x): + return self.func(x) + + +class ConvLayer(nn.Module): + + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + scale='none', + norm_type='none', + relu_type='none', + use_pad=True, + bias=True): + super(ConvLayer, self).__init__() + self.use_pad = use_pad + self.norm_type = norm_type + if norm_type in ['bn']: + bias = False + + stride = 2 if scale == 'down' else 1 + + self.scale_func = lambda x: x + if scale == 'up': + self.scale_func = lambda x: nn.functional.interpolate(x, scale_factor=2, mode='nearest') + + self.reflection_pad = nn.ReflectionPad2d(int(np.ceil((kernel_size - 1.) / 2))) + self.conv2d = nn.Conv2d(in_channels, out_channels, kernel_size, stride, bias=bias) + + self.relu = ReluLayer(out_channels, relu_type) + self.norm = NormLayer(out_channels, norm_type=norm_type) + + def forward(self, x): + out = self.scale_func(x) + if self.use_pad: + out = self.reflection_pad(out) + out = self.conv2d(out) + out = self.norm(out) + out = self.relu(out) + return out + + +class ResidualBlock(nn.Module): + """ + Residual block recommended in: http://torch.ch/blog/2016/02/04/resnets.html + """ + + def __init__(self, c_in, c_out, relu_type='prelu', norm_type='bn', scale='none'): + super(ResidualBlock, self).__init__() + + if scale == 'none' and c_in == c_out: + self.shortcut_func = lambda x: x + else: + self.shortcut_func = ConvLayer(c_in, c_out, 3, scale) + + scale_config_dict = {'down': ['none', 'down'], 'up': ['up', 'none'], 'none': ['none', 'none']} + scale_conf = scale_config_dict[scale] + + self.conv1 = ConvLayer(c_in, c_out, 3, scale_conf[0], norm_type=norm_type, relu_type=relu_type) + self.conv2 = ConvLayer(c_out, c_out, 3, scale_conf[1], norm_type=norm_type, relu_type='none') + + def forward(self, x): + identity = self.shortcut_func(x) + + res = self.conv1(x) + res = self.conv2(res) + return identity + res + + +class ParseNet(nn.Module): + + def __init__(self, + in_size=128, + out_size=128, + min_feat_size=32, + base_ch=64, + parsing_ch=19, + res_depth=10, + relu_type='LeakyReLU', + norm_type='bn', + ch_range=[32, 256]): + super().__init__() + self.res_depth = res_depth + act_args = {'norm_type': norm_type, 'relu_type': relu_type} + min_ch, max_ch = ch_range + + ch_clip = lambda x: max(min_ch, min(x, max_ch)) # noqa: E731 + min_feat_size = min(in_size, min_feat_size) + + down_steps = int(np.log2(in_size // min_feat_size)) + up_steps = int(np.log2(out_size // min_feat_size)) + + # =============== define encoder-body-decoder ==================== + self.encoder = [] + self.encoder.append(ConvLayer(3, base_ch, 3, 1)) + head_ch = base_ch + for i in range(down_steps): + cin, cout = ch_clip(head_ch), ch_clip(head_ch * 2) + self.encoder.append(ResidualBlock(cin, cout, scale='down', **act_args)) + head_ch = head_ch * 2 + + self.body = [] + for i in range(res_depth): + self.body.append(ResidualBlock(ch_clip(head_ch), ch_clip(head_ch), **act_args)) + + self.decoder = [] + for i in range(up_steps): + cin, cout = ch_clip(head_ch), ch_clip(head_ch // 2) + self.decoder.append(ResidualBlock(cin, cout, scale='up', **act_args)) + head_ch = head_ch // 2 + + self.encoder = nn.Sequential(*self.encoder) + self.body = nn.Sequential(*self.body) + self.decoder = nn.Sequential(*self.decoder) + self.out_img_conv = ConvLayer(ch_clip(head_ch), 3) + self.out_mask_conv = ConvLayer(ch_clip(head_ch), parsing_ch) + + def forward(self, x): + feat = self.encoder(x) + x = feat + self.body(feat) + x = self.decoder(x) + out_img = self.out_img_conv(x) + out_mask = 
self.out_mask_conv(x) + return out_mask, out_img diff --git a/imaginairy/vendored/facexlib/parsing/resnet.py b/imaginairy/vendored/facexlib/parsing/resnet.py new file mode 100644 index 0000000..fec8e82 --- /dev/null +++ b/imaginairy/vendored/facexlib/parsing/resnet.py @@ -0,0 +1,69 @@ +import torch.nn as nn +import torch.nn.functional as F + + +def conv3x3(in_planes, out_planes, stride=1): + """3x3 convolution with padding""" + return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False) + + +class BasicBlock(nn.Module): + + def __init__(self, in_chan, out_chan, stride=1): + super(BasicBlock, self).__init__() + self.conv1 = conv3x3(in_chan, out_chan, stride) + self.bn1 = nn.BatchNorm2d(out_chan) + self.conv2 = conv3x3(out_chan, out_chan) + self.bn2 = nn.BatchNorm2d(out_chan) + self.relu = nn.ReLU(inplace=True) + self.downsample = None + if in_chan != out_chan or stride != 1: + self.downsample = nn.Sequential( + nn.Conv2d(in_chan, out_chan, kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(out_chan), + ) + + def forward(self, x): + residual = self.conv1(x) + residual = F.relu(self.bn1(residual)) + residual = self.conv2(residual) + residual = self.bn2(residual) + + shortcut = x + if self.downsample is not None: + shortcut = self.downsample(x) + + out = shortcut + residual + out = self.relu(out) + return out + + +def create_layer_basic(in_chan, out_chan, bnum, stride=1): + layers = [BasicBlock(in_chan, out_chan, stride=stride)] + for i in range(bnum - 1): + layers.append(BasicBlock(out_chan, out_chan, stride=1)) + return nn.Sequential(*layers) + + +class ResNet18(nn.Module): + + def __init__(self): + super(ResNet18, self).__init__() + self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False) + self.bn1 = nn.BatchNorm2d(64) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = create_layer_basic(64, 64, bnum=2, stride=1) + self.layer2 = create_layer_basic(64, 128, bnum=2, stride=2) + self.layer3 = create_layer_basic(128, 256, bnum=2, stride=2) + self.layer4 = create_layer_basic(256, 512, bnum=2, stride=2) + + def forward(self, x): + x = self.conv1(x) + x = F.relu(self.bn1(x)) + x = self.maxpool(x) + + x = self.layer1(x) + feat8 = self.layer2(x) # 1/8 + feat16 = self.layer3(feat8) # 1/16 + feat32 = self.layer4(feat16) # 1/32 + return feat8, feat16, feat32 diff --git a/imaginairy/vendored/facexlib/readme.txt b/imaginairy/vendored/facexlib/readme.txt new file mode 100644 index 0000000..4e3131b --- /dev/null +++ b/imaginairy/vendored/facexlib/readme.txt @@ -0,0 +1 @@ +vendored from git@github.com:xinntao/facexlib.git @ 260620ae93990a300f4b16448df9bb459f1caba9 diff --git a/imaginairy/vendored/facexlib/recognition/__init__.py b/imaginairy/vendored/facexlib/recognition/__init__.py new file mode 100644 index 0000000..5d7571b --- /dev/null +++ b/imaginairy/vendored/facexlib/recognition/__init__.py @@ -0,0 +1,19 @@ +import torch + +from imaginairy.vendored.facexlib.utils import load_file_from_url +from .arcface_arch import Backbone + + +def init_recognition_model(model_name, half=False, device='cuda', model_rootpath=None): + if model_name == 'arcface': + model = Backbone(num_layers=50, drop_ratio=0.6, mode='ir_se').to('cuda').eval() + model_url = 'https://github.com/xinntao/facexlib/releases/download/v0.1.0/recognition_arcface_ir_se50.pth' + else: + raise NotImplementedError(f'{model_name} is not implemented.') + + model_path = load_file_from_url( + url=model_url, model_dir='facexlib/weights', progress=True, 
file_name=None, save_dir=model_rootpath) + model.load_state_dict(torch.load(model_path), strict=True) + model.eval() + model = model.to(device) + return model diff --git a/imaginairy/vendored/facexlib/recognition/arcface_arch.py b/imaginairy/vendored/facexlib/recognition/arcface_arch.py new file mode 100644 index 0000000..0595f86 --- /dev/null +++ b/imaginairy/vendored/facexlib/recognition/arcface_arch.py @@ -0,0 +1,238 @@ +import torch +from collections import namedtuple +from torch.nn import (AdaptiveAvgPool2d, BatchNorm1d, BatchNorm2d, Conv2d, Dropout, Linear, MaxPool2d, Module, PReLU, + ReLU, Sequential, Sigmoid) + +# Original Arcface Model + + +class Flatten(Module): + + def forward(self, input): + return input.view(input.size(0), -1) + + +def l2_norm(input, axis=1): + norm = torch.norm(input, 2, axis, True) + output = torch.div(input, norm) + return output + + +class SEModule(Module): + + def __init__(self, channels, reduction): + super(SEModule, self).__init__() + self.avg_pool = AdaptiveAvgPool2d(1) + self.fc1 = Conv2d(channels, channels // reduction, kernel_size=1, padding=0, bias=False) + self.relu = ReLU(inplace=True) + self.fc2 = Conv2d(channels // reduction, channels, kernel_size=1, padding=0, bias=False) + self.sigmoid = Sigmoid() + + def forward(self, x): + module_input = x + x = self.avg_pool(x) + x = self.fc1(x) + x = self.relu(x) + x = self.fc2(x) + x = self.sigmoid(x) + return module_input * x + + +class bottleneck_IR(Module): + + def __init__(self, in_channel, depth, stride): + super(bottleneck_IR, self).__init__() + if in_channel == depth: + self.shortcut_layer = MaxPool2d(1, stride) + else: + self.shortcut_layer = Sequential(Conv2d(in_channel, depth, (1, 1), stride, bias=False), BatchNorm2d(depth)) + self.res_layer = Sequential( + BatchNorm2d(in_channel), Conv2d(in_channel, depth, (3, 3), (1, 1), 1, bias=False), PReLU(depth), + Conv2d(depth, depth, (3, 3), stride, 1, bias=False), BatchNorm2d(depth)) + + def forward(self, x): + shortcut = self.shortcut_layer(x) + res = self.res_layer(x) + return res + shortcut + + +class bottleneck_IR_SE(Module): + + def __init__(self, in_channel, depth, stride): + super(bottleneck_IR_SE, self).__init__() + if in_channel == depth: + self.shortcut_layer = MaxPool2d(1, stride) + else: + self.shortcut_layer = Sequential(Conv2d(in_channel, depth, (1, 1), stride, bias=False), BatchNorm2d(depth)) + self.res_layer = Sequential( + BatchNorm2d(in_channel), Conv2d(in_channel, depth, (3, 3), (1, 1), 1, bias=False), PReLU(depth), + Conv2d(depth, depth, (3, 3), stride, 1, bias=False), BatchNorm2d(depth), SEModule(depth, 16)) + + def forward(self, x): + shortcut = self.shortcut_layer(x) + res = self.res_layer(x) + return res + shortcut + + +class Bottleneck(namedtuple('Block', ['in_channel', 'depth', 'stride'])): + '''A named tuple describing a ResNet block.''' + + +def get_block(in_channel, depth, num_units, stride=2): + return [Bottleneck(in_channel, depth, stride)] + [Bottleneck(depth, depth, 1) for i in range(num_units - 1)] + + +def get_blocks(num_layers): + if num_layers == 50: + blocks = [ + get_block(in_channel=64, depth=64, num_units=3), + get_block(in_channel=64, depth=128, num_units=4), + get_block(in_channel=128, depth=256, num_units=14), + get_block(in_channel=256, depth=512, num_units=3) + ] + elif num_layers == 100: + blocks = [ + get_block(in_channel=64, depth=64, num_units=3), + get_block(in_channel=64, depth=128, num_units=13), + get_block(in_channel=128, depth=256, num_units=30), + get_block(in_channel=256, depth=512, num_units=3) 
+ ] + elif num_layers == 152: + blocks = [ + get_block(in_channel=64, depth=64, num_units=3), + get_block(in_channel=64, depth=128, num_units=8), + get_block(in_channel=128, depth=256, num_units=36), + get_block(in_channel=256, depth=512, num_units=3) + ] + return blocks + + +class Backbone(Module): + + def __init__(self, num_layers, drop_ratio, mode='ir'): + super(Backbone, self).__init__() + assert num_layers in [50, 100, 152], 'num_layers should be 50,100, or 152' + assert mode in ['ir', 'ir_se'], 'mode should be ir or ir_se' + blocks = get_blocks(num_layers) + if mode == 'ir': + unit_module = bottleneck_IR + elif mode == 'ir_se': + unit_module = bottleneck_IR_SE + self.input_layer = Sequential(Conv2d(3, 64, (3, 3), 1, 1, bias=False), BatchNorm2d(64), PReLU(64)) + self.output_layer = Sequential( + BatchNorm2d(512), Dropout(drop_ratio), Flatten(), Linear(512 * 7 * 7, 512), BatchNorm1d(512)) + modules = [] + for block in blocks: + for bottleneck in block: + modules.append(unit_module(bottleneck.in_channel, bottleneck.depth, bottleneck.stride)) + self.body = Sequential(*modules) + + def forward(self, x): + x = self.input_layer(x) + x = self.body(x) + x = self.output_layer(x) + return l2_norm(x) + + +# MobileFaceNet + + +class Conv_block(Module): + + def __init__(self, in_c, out_c, kernel=(1, 1), stride=(1, 1), padding=(0, 0), groups=1): + super(Conv_block, self).__init__() + self.conv = Conv2d( + in_c, out_channels=out_c, kernel_size=kernel, groups=groups, stride=stride, padding=padding, bias=False) + self.bn = BatchNorm2d(out_c) + self.prelu = PReLU(out_c) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.prelu(x) + return x + + +class Linear_block(Module): + + def __init__(self, in_c, out_c, kernel=(1, 1), stride=(1, 1), padding=(0, 0), groups=1): + super(Linear_block, self).__init__() + self.conv = Conv2d( + in_c, out_channels=out_c, kernel_size=kernel, groups=groups, stride=stride, padding=padding, bias=False) + self.bn = BatchNorm2d(out_c) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + return x + + +class Depth_Wise(Module): + + def __init__(self, in_c, out_c, residual=False, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=1): + super(Depth_Wise, self).__init__() + self.conv = Conv_block(in_c, out_c=groups, kernel=(1, 1), padding=(0, 0), stride=(1, 1)) + self.conv_dw = Conv_block(groups, groups, groups=groups, kernel=kernel, padding=padding, stride=stride) + self.project = Linear_block(groups, out_c, kernel=(1, 1), padding=(0, 0), stride=(1, 1)) + self.residual = residual + + def forward(self, x): + if self.residual: + short_cut = x + x = self.conv(x) + x = self.conv_dw(x) + x = self.project(x) + if self.residual: + output = short_cut + x + else: + output = x + return output + + +class Residual(Module): + + def __init__(self, c, num_block, groups, kernel=(3, 3), stride=(1, 1), padding=(1, 1)): + super(Residual, self).__init__() + modules = [] + for _ in range(num_block): + modules.append( + Depth_Wise(c, c, residual=True, kernel=kernel, padding=padding, stride=stride, groups=groups)) + self.model = Sequential(*modules) + + def forward(self, x): + return self.model(x) + + +class MobileFaceNet(Module): + + def __init__(self, embedding_size): + super(MobileFaceNet, self).__init__() + self.conv1 = Conv_block(3, 64, kernel=(3, 3), stride=(2, 2), padding=(1, 1)) + self.conv2_dw = Conv_block(64, 64, kernel=(3, 3), stride=(1, 1), padding=(1, 1), groups=64) + self.conv_23 = Depth_Wise(64, 64, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=128) 
+ self.conv_3 = Residual(64, num_block=4, groups=128, kernel=(3, 3), stride=(1, 1), padding=(1, 1)) + self.conv_34 = Depth_Wise(64, 128, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=256) + self.conv_4 = Residual(128, num_block=6, groups=256, kernel=(3, 3), stride=(1, 1), padding=(1, 1)) + self.conv_45 = Depth_Wise(128, 128, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=512) + self.conv_5 = Residual(128, num_block=2, groups=256, kernel=(3, 3), stride=(1, 1), padding=(1, 1)) + self.conv_6_sep = Conv_block(128, 512, kernel=(1, 1), stride=(1, 1), padding=(0, 0)) + self.conv_6_dw = Linear_block(512, 512, groups=512, kernel=(7, 7), stride=(1, 1), padding=(0, 0)) + self.conv_6_flatten = Flatten() + self.linear = Linear(512, embedding_size, bias=False) + self.bn = BatchNorm1d(embedding_size) + + def forward(self, x): + out = self.conv1(x) + out = self.conv2_dw(out) + out = self.conv_23(out) + out = self.conv_3(out) + out = self.conv_34(out) + out = self.conv_4(out) + out = self.conv_45(out) + out = self.conv_5(out) + out = self.conv_6_sep(out) + out = self.conv_6_dw(out) + out = self.conv_6_flatten(out) + out = self.linear(out) + out = self.bn(out) + return l2_norm(out) diff --git a/imaginairy/vendored/facexlib/tracking/README.md b/imaginairy/vendored/facexlib/tracking/README.md new file mode 100644 index 0000000..cd67a5d --- /dev/null +++ b/imaginairy/vendored/facexlib/tracking/README.md @@ -0,0 +1 @@ +https://github.com/abewley/sort diff --git a/imaginairy/vendored/facexlib/tracking/__init__.py b/imaginairy/vendored/facexlib/tracking/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/imaginairy/vendored/facexlib/tracking/data_association.py b/imaginairy/vendored/facexlib/tracking/data_association.py new file mode 100644 index 0000000..c71cac8 --- /dev/null +++ b/imaginairy/vendored/facexlib/tracking/data_association.py @@ -0,0 +1,71 @@ +""" +For each detected item, it computes the intersection over union (IOU) w.r.t. +each tracked object. (IOU matrix) +Then, it applies the Hungarian algorithm (via linear_assignment) to assign each +det. item to the best possible tracked item (i.e. to the one with max IOU) +""" + +import numpy as np +from numba import jit +from scipy.optimize import linear_sum_assignment as linear_assignment + + +@jit +def iou(bb_test, bb_gt): + """Computes IOU between two bboxes in the form [x1,y1,x2,y2] + """ + xx1 = np.maximum(bb_test[0], bb_gt[0]) + yy1 = np.maximum(bb_test[1], bb_gt[1]) + xx2 = np.minimum(bb_test[2], bb_gt[2]) + yy2 = np.minimum(bb_test[3], bb_gt[3]) + w = np.maximum(0., xx2 - xx1) + h = np.maximum(0., yy2 - yy1) + wh = w * h + o = wh / ((bb_test[2] - bb_test[0]) * (bb_test[3] - bb_test[1]) + (bb_gt[2] - bb_gt[0]) * + (bb_gt[3] - bb_gt[1]) - wh) + return (o) + + +def associate_detections_to_trackers(detections, trackers, iou_threshold=0.25): + """Assigns detections to tracked object (both represented as bounding boxes) + + Returns: + 3 lists of matches, unmatched_detections and unmatched_trackers. + """ + if len(trackers) == 0: + return np.empty((0, 2), dtype=int), np.arange(len(detections)), np.empty((0, 5), dtype=int) + + iou_matrix = np.zeros((len(detections), len(trackers)), dtype=np.float32) + + for d, det in enumerate(detections): + for t, trk in enumerate(trackers): + iou_matrix[d, t] = iou(det, trk) + # The linear assignment module tries to minimize the total assignment cost. + # In our case we pass -iou_matrix as we want to maximise the total IOU + # between track predictions and the frame detection. 
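+ # scipy's linear_sum_assignment (imported above as linear_assignment) returns two parallel index + # arrays: row_ind indexes detections and col_ind indexes trackers, so each (row_ind[k], col_ind[k]) + # pair is a candidate match, which is then filtered against iou_threshold further below.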
+ row_ind, col_ind = linear_assignment(-iou_matrix) + + unmatched_detections = [] + for d, det in enumerate(detections): + if d not in row_ind: + unmatched_detections.append(d) + unmatched_trackers = [] + for t, trk in enumerate(trackers): + if t not in col_ind: + unmatched_trackers.append(t) + + # filter out matched with low IOU + matches = [] + for row, col in zip(row_ind, col_ind): + if iou_matrix[row, col] < iou_threshold: + unmatched_detections.append(row) + unmatched_trackers.append(col) + else: + matches.append(np.array([[row, col]])) + + if len(matches) == 0: + matches = np.empty((0, 2), dtype=int) + else: + matches = np.concatenate(matches, axis=0) + + return matches, np.array(unmatched_detections), np.array(unmatched_trackers) diff --git a/imaginairy/vendored/facexlib/tracking/kalman_tracker.py b/imaginairy/vendored/facexlib/tracking/kalman_tracker.py new file mode 100644 index 0000000..eab642c --- /dev/null +++ b/imaginairy/vendored/facexlib/tracking/kalman_tracker.py @@ -0,0 +1,108 @@ +import numpy as np +from filterpy.kalman import KalmanFilter + + +def convert_bbox_to_z(bbox): + """Takes a bounding box in the form [x1,y1,x2,y2] and returns z in the form + [x,y,s,r] where x,y is the centre of the box and s is the scale/area and + r is the aspect ratio + """ + w = bbox[2] - bbox[0] + h = bbox[3] - bbox[1] + x = bbox[0] + w / 2. + y = bbox[1] + h / 2. + s = w * h # scale is just area + r = w / float(h) + return np.array([x, y, s, r]).reshape((4, 1)) + + +def convert_x_to_bbox(x, score=None): + """Takes a bounding box in the centre form [x,y,s,r] and returns it in + the form [x1,y1,x2,y2] where x1,y1 is the top left and x2,y2 is the bottom + right + """ + w = np.sqrt(x[2] * x[3]) + h = x[2] / w + if score is None: + return np.array([x[0] - w / 2., x[1] - h / 2., x[0] + w / 2., x[1] + h / 2.]).reshape((1, 4)) + else: + return np.array([x[0] - w / 2., x[1] - h / 2., x[0] + w / 2., x[1] + h / 2., score]).reshape((1, 5)) + + +class KalmanBoxTracker(object): + """This class represents the internal state of individual tracked objects + observed as bbox. + doc: https://filterpy.readthedocs.io/en/latest/kalman/KalmanFilter.html + """ + count = 0 + + def __init__(self, bbox): + """Initialize a tracker using initial bounding box. + """ + # define constant velocity model + # TODO: x: what is the meanning of x[4:7], v? + self.kf = KalmanFilter(dim_x=7, dim_z=4) + # F (dim_x, dim_x): state transition matrix + self.kf.F = np.array([[1, 0, 0, 0, 1, 0, 0], [0, 1, 0, 0, 0, 1, 0], [0, 0, 1, 0, 0, 0, + 1], [0, 0, 0, 1, 0, 0, 0], + [0, 0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 0, 1]]) + # H (dim_z, dim_x): measurement function + self.kf.H = np.array([[1, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0], + [0, 0, 0, 1, 0, 0, 0]]) + # R (dim_z, dim_z): measurement uncertainty/noise + self.kf.R[2:, 2:] *= 10. + # P (dim_x, dim_x): covariance matrix + # give high uncertainty to the unobservable initial velocities + self.kf.P[4:, 4:] *= 1000. + self.kf.P *= 10. 
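+ # Note on the TODO above: given convert_bbox_to_z and F, the state appears to be + # x = [cx, cy, s, r, vx, vy, vs], i.e. box centre, area (scale) and aspect ratio plus the + # velocities of the first three, so x[4:7] holds the velocities; the aspect ratio r is assumed constant.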
+ # Q (dim_x, dim_x): Process uncertainty/noise + self.kf.Q[-1, -1] *= 0.01 + self.kf.Q[4:, 4:] *= 0.01 + # x (dim_x, 1): filter state estimate + self.kf.x[:4] = convert_bbox_to_z(bbox) + + self.time_since_update = 0 + self.id = KalmanBoxTracker.count + KalmanBoxTracker.count += 1 + self.history = [] + self.hits = 0 + self.hit_streak = 0 + self.age = 0 + + # fix the drift bug where existing trackers keep predicting a face even when no face is detected in the frame + self.predict_num = 0 # number of consecutive predictions (frames without a matched detection) + + # additional fields + self.face_attributes = [] + + def update(self, bbox): + """Updates the state vector with observed bbox. + """ + self.time_since_update = 0 + self.history = [] + self.hits += 1 + self.hit_streak += 1 # consecutive hits + if bbox != []: + self.kf.update(convert_bbox_to_z(bbox)) + self.predict_num = 0 + else: + self.predict_num += 1 + + def predict(self): + """Advances the state vector and returns the predicted bounding box + estimate. + """ + + if (self.kf.x[6] + self.kf.x[2]) <= 0: + self.kf.x[6] *= 0.0 + self.kf.predict() + self.age += 1 + if self.time_since_update > 0: + self.hit_streak = 0 + self.time_since_update += 1 + self.history.append(convert_x_to_bbox(self.kf.x)) + return self.history[-1][0] + + def get_state(self): + """Returns the current bounding box estimate.""" + return convert_x_to_bbox(self.kf.x)[0] diff --git a/imaginairy/vendored/facexlib/tracking/sort.py b/imaginairy/vendored/facexlib/tracking/sort.py new file mode 100644 index 0000000..92aa308 --- /dev/null +++ b/imaginairy/vendored/facexlib/tracking/sort.py @@ -0,0 +1,92 @@ +import numpy as np + +from imaginairy.vendored.facexlib.tracking.data_association import associate_detections_to_trackers +from imaginairy.vendored.facexlib.tracking.kalman_tracker import KalmanBoxTracker + + +class SORT(object): + """SORT: A Simple, Online and Realtime Tracker. + + Ref: https://github.com/abewley/sort + """ + + def __init__(self, max_age=1, min_hits=3, iou_threshold=0.3): + self.max_age = max_age + self.min_hits = min_hits # minimum consecutive hits; only trackers that reach this are returned + self.iou_threshold = iou_threshold + self.trackers = [] + self.frame_count = 0 + + def update(self, dets, img_size, additional_attr, detect_interval): + """This method must be called once for each frame even with + empty detections. + NOTE: as in practical realtime MOT, the detector doesn't run on every + single frame. + + Args: + dets (Numpy array): detections in the format + [[x0,y0,x1,y1,score], [x0,y0,x1,y1,score], ...] + + Returns: + a similar array, where the last column is the object ID.
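+ Each returned row is [x0, y0, x1, y1, track_id], so the array has shape (n, 5).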
+ """ + self.frame_count += 1 + + # get predicted locations from existing trackers + trks = np.zeros((len(self.trackers), 5)) + to_del = [] # To be deleted + ret = [] + # predict tracker position using Kalman filter + for t, trk in enumerate(trks): + pos = self.trackers[t].predict() # Kalman predict, very fast, <1ms + trk[:] = [pos[0], pos[1], pos[2], pos[3], 0] + if np.any(np.isnan(pos)): + to_del.append(t) + trks = np.ma.compress_rows(np.ma.masked_invalid(trks)) + for t in reversed(to_del): + self.trackers.pop(t) + + if dets != []: + matched, unmatched_dets, unmatched_trks = associate_detections_to_trackers( # noqa: E501 + dets, trks) + + # update matched trackers with assigned detections + for t, trk in enumerate(self.trackers): + if t not in unmatched_trks: + d = matched[np.where(matched[:, 1] == t)[0], 0] + trk.update(dets[d, :][0]) + trk.face_attributes.append(additional_attr[d[0]]) + + # create and initialize new trackers for unmatched detections + for i in unmatched_dets: + trk = KalmanBoxTracker(dets[i, :]) + trk.face_attributes.append(additional_attr[i]) + print(f'New tracker: {trk.id + 1}.') + self.trackers.append(trk) + + i = len(self.trackers) + for trk in reversed(self.trackers): + if dets == []: + trk.update([]) + + d = trk.get_state() + # get return tracklet + # 1) time_since_update < 1: detected + # 2) i) hit_streak >= min_hits: minimum consecutive hits reached + # ii) frame_count <= min_hits: still within the first few frames + if (trk.time_since_update < 1) and (trk.hit_streak >= self.min_hits or self.frame_count <= self.min_hits): + ret.append(np.concatenate((d, [trk.id + 1])).reshape(1, -1)) # +1 as MOT benchmark requires positive + i -= 1 + + # remove dead tracklet + # 1) time_since_update >= max_age: not updated for too long + # 2) predict_num: number of consecutively predicted frames + # 3) out of image size + if (trk.time_since_update >= self.max_age) or (trk.predict_num >= detect_interval) or ( + d[2] < 0 or d[3] < 0 or d[0] > img_size[1] or d[1] > img_size[0]): + print(f'Remove tracker: {trk.id + 1}') + self.trackers.pop(i) + if len(ret) > 0: + return np.concatenate(ret) + else: + return np.empty((0, 5)) diff --git a/imaginairy/vendored/facexlib/utils/__init__.py b/imaginairy/vendored/facexlib/utils/__init__.py new file mode 100644 index 0000000..706e077 --- /dev/null +++ b/imaginairy/vendored/facexlib/utils/__init__.py @@ -0,0 +1,7 @@ +from .face_utils import align_crop_face_landmarks, compute_increased_bbox, get_valid_bboxes, paste_face_back +from .misc import img2tensor, load_file_from_url, scandir + +__all__ = [ + 'align_crop_face_landmarks', 'compute_increased_bbox', 'get_valid_bboxes', 'load_file_from_url', 'paste_face_back', + 'img2tensor', 'scandir' +] diff --git a/imaginairy/vendored/facexlib/utils/face_restoration_helper.py b/imaginairy/vendored/facexlib/utils/face_restoration_helper.py new file mode 100644 index 0000000..05500a5 --- /dev/null +++ b/imaginairy/vendored/facexlib/utils/face_restoration_helper.py @@ -0,0 +1,374 @@ +import cv2 +import numpy as np +import os +import torch +from torchvision.transforms.functional import normalize + +from imaginairy.vendored.facexlib.detection import init_detection_model +from imaginairy.vendored.facexlib.parsing import init_parsing_model +from imaginairy.vendored.facexlib.utils.misc import img2tensor, imwrite + + +def get_largest_face(det_faces, h, w): + + def get_location(val, length): + if val < 0: + return 0 + elif val > length: + return length + else: + return val + + face_areas = [] + for det_face in det_faces: + left = get_location(det_face[0], w) + right = get_location(det_face[2], w) + top =
get_location(det_face[1], h) + bottom = get_location(det_face[3], h) + face_area = (right - left) * (bottom - top) + face_areas.append(face_area) + largest_idx = face_areas.index(max(face_areas)) + return det_faces[largest_idx], largest_idx + + +def get_center_face(det_faces, h=0, w=0, center=None): + if center is not None: + center = np.array(center) + else: + center = np.array([w / 2, h / 2]) + center_dist = [] + for det_face in det_faces: + face_center = np.array([(det_face[0] + det_face[2]) / 2, (det_face[1] + det_face[3]) / 2]) + dist = np.linalg.norm(face_center - center) + center_dist.append(dist) + center_idx = center_dist.index(min(center_dist)) + return det_faces[center_idx], center_idx + + +class FaceRestoreHelper(object): + """Helper for the face restoration pipeline (base class).""" + + def __init__(self, + upscale_factor, + face_size=512, + crop_ratio=(1, 1), + det_model='retinaface_resnet50', + save_ext='png', + template_3points=False, + pad_blur=False, + use_parse=False, + device=None, + model_rootpath=None): + self.template_3points = template_3points # improve robustness + self.upscale_factor = upscale_factor + # the cropped face ratio based on the square face + self.crop_ratio = crop_ratio # (h, w) + assert (self.crop_ratio[0] >= 1 and self.crop_ratio[1] >= 1), 'crop ration only supports >=1' + self.face_size = (int(face_size * self.crop_ratio[1]), int(face_size * self.crop_ratio[0])) + + if self.template_3points: + self.face_template = np.array([[192, 240], [319, 240], [257, 371]]) + else: + # standard 5 landmarks for FFHQ faces with 512 x 512 + self.face_template = np.array([[192.98138, 239.94708], [318.90277, 240.1936], [256.63416, 314.01935], + [201.26117, 371.41043], [313.08905, 371.15118]]) + self.face_template = self.face_template * (face_size / 512.0) + if self.crop_ratio[0] > 1: + self.face_template[:, 1] += face_size * (self.crop_ratio[0] - 1) / 2 + if self.crop_ratio[1] > 1: + self.face_template[:, 0] += face_size * (self.crop_ratio[1] - 1) / 2 + self.save_ext = save_ext + self.pad_blur = pad_blur + if self.pad_blur is True: + self.template_3points = False + + self.all_landmarks_5 = [] + self.det_faces = [] + self.affine_matrices = [] + self.inverse_affine_matrices = [] + self.cropped_faces = [] + self.restored_faces = [] + self.pad_input_imgs = [] + + if device is None: + self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + else: + self.device = device + + # init face detection model + self.face_det = init_detection_model(det_model, half=False, device=self.device, model_rootpath=model_rootpath) + + # init face parsing model + self.use_parse = use_parse + self.face_parse = init_parsing_model(model_name='parsenet', device=self.device, model_rootpath=model_rootpath) + + def set_upscale_factor(self, upscale_factor): + self.upscale_factor = upscale_factor + + def read_image(self, img): + """img can be image path or cv2 loaded image.""" + # self.input_img is Numpy array, (h, w, c), BGR, uint8, [0, 255] + if isinstance(img, str): + img = cv2.imread(img) + + if np.max(img) > 256: # 16-bit image + img = img / 65535 * 255 + if len(img.shape) == 2: # gray image + img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) + elif img.shape[2] == 4: # RGBA image with alpha channel + img = img[:, :, 0:3] + + self.input_img = img + + def get_face_landmarks_5(self, + only_keep_largest=False, + only_center_face=False, + resize=None, + blur_ratio=0.01, + eye_dist_threshold=None): + if resize is None: + scale = 1 + input_img = self.input_img + else: + h, w = 
self.input_img.shape[0:2] + scale = min(h, w) / resize + h, w = int(h / scale), int(w / scale) + input_img = cv2.resize(self.input_img, (w, h), interpolation=cv2.INTER_LANCZOS4) + + with torch.no_grad(): + bboxes = self.face_det.detect_faces(input_img, 0.97) * scale + for bbox in bboxes: + # remove faces with too small eye distance: side faces or too small faces + eye_dist = np.linalg.norm([bbox[5] - bbox[7], bbox[6] - bbox[8]]) + if eye_dist_threshold is not None and (eye_dist < eye_dist_threshold): + continue + + if self.template_3points: + landmark = np.array([[bbox[i], bbox[i + 1]] for i in range(5, 11, 2)]) + else: + landmark = np.array([[bbox[i], bbox[i + 1]] for i in range(5, 15, 2)]) + self.all_landmarks_5.append(landmark) + self.det_faces.append(bbox[0:5]) + if len(self.det_faces) == 0: + return 0 + if only_keep_largest: + h, w, _ = self.input_img.shape + self.det_faces, largest_idx = get_largest_face(self.det_faces, h, w) + self.all_landmarks_5 = [self.all_landmarks_5[largest_idx]] + elif only_center_face: + h, w, _ = self.input_img.shape + self.det_faces, center_idx = get_center_face(self.det_faces, h, w) + self.all_landmarks_5 = [self.all_landmarks_5[center_idx]] + + # pad blurry images + if self.pad_blur: + self.pad_input_imgs = [] + for landmarks in self.all_landmarks_5: + # get landmarks + eye_left = landmarks[0, :] + eye_right = landmarks[1, :] + eye_avg = (eye_left + eye_right) * 0.5 + mouth_avg = (landmarks[3, :] + landmarks[4, :]) * 0.5 + eye_to_eye = eye_right - eye_left + eye_to_mouth = mouth_avg - eye_avg + + # Get the oriented crop rectangle + # x: half width of the oriented crop rectangle + x = eye_to_eye - np.flipud(eye_to_mouth) * [-1, 1] + # - np.flipud(eye_to_mouth) * [-1, 1]: rotate 90 clockwise + # norm with the hypotenuse: get the direction + x /= np.hypot(*x) # get the hypotenuse of a right triangle + rect_scale = 1.5 + x *= max(np.hypot(*eye_to_eye) * 2.0 * rect_scale, np.hypot(*eye_to_mouth) * 1.8 * rect_scale) + # y: half height of the oriented crop rectangle + y = np.flipud(x) * [-1, 1] + + # c: center + c = eye_avg + eye_to_mouth * 0.1 + # quad: (left_top, left_bottom, right_bottom, right_top) + quad = np.stack([c - x - y, c - x + y, c + x + y, c + x - y]) + # qsize: side length of the square + qsize = np.hypot(*x) * 2 + border = max(int(np.rint(qsize * 0.1)), 3) + + # get pad + # pad: (width_left, height_top, width_right, height_bottom) + pad = (int(np.floor(min(quad[:, 0]))), int(np.floor(min(quad[:, 1]))), int(np.ceil(max(quad[:, 0]))), + int(np.ceil(max(quad[:, 1])))) + pad = [ + max(-pad[0] + border, 1), + max(-pad[1] + border, 1), + max(pad[2] - self.input_img.shape[0] + border, 1), + max(pad[3] - self.input_img.shape[1] + border, 1) + ] + + if max(pad) > 1: + # pad image + pad_img = np.pad(self.input_img, ((pad[1], pad[3]), (pad[0], pad[2]), (0, 0)), 'reflect') + # modify landmark coords + landmarks[:, 0] += pad[0] + landmarks[:, 1] += pad[1] + # blur pad images + h, w, _ = pad_img.shape + y, x, _ = np.ogrid[:h, :w, :1] + mask = np.maximum(1.0 - np.minimum(np.float32(x) / pad[0], + np.float32(w - 1 - x) / pad[2]), + 1.0 - np.minimum(np.float32(y) / pad[1], + np.float32(h - 1 - y) / pad[3])) + blur = int(qsize * blur_ratio) + if blur % 2 == 0: + blur += 1 + blur_img = cv2.boxFilter(pad_img, 0, ksize=(blur, blur)) + # blur_img = cv2.GaussianBlur(pad_img, (blur, blur), 0) + + pad_img = pad_img.astype('float32') + pad_img += (blur_img - pad_img) * np.clip(mask * 3.0 + 1.0, 0.0, 1.0) + pad_img += (np.median(pad_img, axis=(0, 1)) - pad_img) * 
np.clip(mask, 0.0, 1.0) + pad_img = np.clip(pad_img, 0, 255) # float32, [0, 255] + self.pad_input_imgs.append(pad_img) + else: + self.pad_input_imgs.append(np.copy(self.input_img)) + + return len(self.all_landmarks_5) + + def align_warp_face(self, save_cropped_path=None, border_mode='constant'): + """Align and warp faces with face template. + """ + if self.pad_blur: + assert len(self.pad_input_imgs) == len( + self.all_landmarks_5), f'Mismatched samples: {len(self.pad_input_imgs)} and {len(self.all_landmarks_5)}' + for idx, landmark in enumerate(self.all_landmarks_5): + # use 5 landmarks to get affine matrix + # use cv2.LMEDS method for the equivalence to skimage transform + # ref: https://blog.csdn.net/yichxi/article/details/115827338 + affine_matrix = cv2.estimateAffinePartial2D(landmark, self.face_template, method=cv2.LMEDS)[0] + self.affine_matrices.append(affine_matrix) + # warp and crop faces + if border_mode == 'constant': + border_mode = cv2.BORDER_CONSTANT + elif border_mode == 'reflect101': + border_mode = cv2.BORDER_REFLECT101 + elif border_mode == 'reflect': + border_mode = cv2.BORDER_REFLECT + if self.pad_blur: + input_img = self.pad_input_imgs[idx] + else: + input_img = self.input_img + cropped_face = cv2.warpAffine( + input_img, affine_matrix, self.face_size, borderMode=border_mode, borderValue=(135, 133, 132)) # gray + self.cropped_faces.append(cropped_face) + # save the cropped face + if save_cropped_path is not None: + path = os.path.splitext(save_cropped_path)[0] + save_path = f'{path}_{idx:02d}.{self.save_ext}' + imwrite(cropped_face, save_path) + + def get_inverse_affine(self, save_inverse_affine_path=None): + """Get inverse affine matrix.""" + for idx, affine_matrix in enumerate(self.affine_matrices): + inverse_affine = cv2.invertAffineTransform(affine_matrix) + inverse_affine *= self.upscale_factor + self.inverse_affine_matrices.append(inverse_affine) + # save inverse affine matrices + if save_inverse_affine_path is not None: + path, _ = os.path.splitext(save_inverse_affine_path) + save_path = f'{path}_{idx:02d}.pth' + torch.save(inverse_affine, save_path) + + def add_restored_face(self, face): + self.restored_faces.append(face) + + def paste_faces_to_input_image(self, save_path=None, upsample_img=None): + h, w, _ = self.input_img.shape + h_up, w_up = int(h * self.upscale_factor), int(w * self.upscale_factor) + + if upsample_img is None: + # simply resize the background + upsample_img = cv2.resize(self.input_img, (w_up, h_up), interpolation=cv2.INTER_LANCZOS4) + else: + upsample_img = cv2.resize(upsample_img, (w_up, h_up), interpolation=cv2.INTER_LANCZOS4) + + assert len(self.restored_faces) == len( + self.inverse_affine_matrices), ('length of restored_faces and affine_matrices are different.') + for restored_face, inverse_affine in zip(self.restored_faces, self.inverse_affine_matrices): + # Add an offset to inverse affine matrix, for more precise back alignment + if self.upscale_factor > 1: + extra_offset = 0.5 * self.upscale_factor + else: + extra_offset = 0 + inverse_affine[:, 2] += extra_offset + inv_restored = cv2.warpAffine(restored_face, inverse_affine, (w_up, h_up)) + + if self.use_parse: + # inference + face_input = cv2.resize(restored_face, (512, 512), interpolation=cv2.INTER_LINEAR) + face_input = img2tensor(face_input.astype('float32') / 255., bgr2rgb=True, float32=True) + normalize(face_input, (0.5, 0.5, 0.5), (0.5, 0.5, 0.5), inplace=True) + face_input = torch.unsqueeze(face_input, 0).to(self.device) + with torch.no_grad(): + out = 
self.face_parse(face_input)[0] + out = out.argmax(dim=1).squeeze().cpu().numpy() + + mask = np.zeros(out.shape) + MASK_COLORMAP = [0, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 0, 255, 0, 0, 0] + for idx, color in enumerate(MASK_COLORMAP): + mask[out == idx] = color + # blur the mask + mask = cv2.GaussianBlur(mask, (101, 101), 11) + mask = cv2.GaussianBlur(mask, (101, 101), 11) + # remove the black borders + thres = 10 + mask[:thres, :] = 0 + mask[-thres:, :] = 0 + mask[:, :thres] = 0 + mask[:, -thres:] = 0 + mask = mask / 255. + + mask = cv2.resize(mask, restored_face.shape[:2]) + mask = cv2.warpAffine(mask, inverse_affine, (w_up, h_up), flags=3) + inv_soft_mask = mask[:, :, None] + pasted_face = inv_restored + + else: # use square parse maps + mask = np.ones(self.face_size, dtype=np.float32) + inv_mask = cv2.warpAffine(mask, inverse_affine, (w_up, h_up)) + # remove the black borders + inv_mask_erosion = cv2.erode( + inv_mask, np.ones((int(2 * self.upscale_factor), int(2 * self.upscale_factor)), np.uint8)) + pasted_face = inv_mask_erosion[:, :, None] * inv_restored + total_face_area = np.sum(inv_mask_erosion) # // 3 + # compute the fusion edge based on the area of face + w_edge = int(total_face_area**0.5) // 20 + erosion_radius = w_edge * 2 + inv_mask_center = cv2.erode(inv_mask_erosion, np.ones((erosion_radius, erosion_radius), np.uint8)) + blur_size = w_edge * 2 + inv_soft_mask = cv2.GaussianBlur(inv_mask_center, (blur_size + 1, blur_size + 1), 0) + if len(upsample_img.shape) == 2: # upsample_img is gray image + upsample_img = upsample_img[:, :, None] + inv_soft_mask = inv_soft_mask[:, :, None] + + if len(upsample_img.shape) == 3 and upsample_img.shape[2] == 4: # alpha channel + alpha = upsample_img[:, :, 3:] + upsample_img = inv_soft_mask * pasted_face + (1 - inv_soft_mask) * upsample_img[:, :, 0:3] + upsample_img = np.concatenate((upsample_img, alpha), axis=2) + else: + upsample_img = inv_soft_mask * pasted_face + (1 - inv_soft_mask) * upsample_img + + if np.max(upsample_img) > 256: # 16-bit image + upsample_img = upsample_img.astype(np.uint16) + else: + upsample_img = upsample_img.astype(np.uint8) + if save_path is not None: + path = os.path.splitext(save_path)[0] + save_path = f'{path}.{self.save_ext}' + imwrite(upsample_img, save_path) + return upsample_img + + def clean_all(self): + self.all_landmarks_5 = [] + self.restored_faces = [] + self.affine_matrices = [] + self.cropped_faces = [] + self.inverse_affine_matrices = [] + self.det_faces = [] + self.pad_input_imgs = [] diff --git a/imaginairy/vendored/facexlib/utils/face_utils.py b/imaginairy/vendored/facexlib/utils/face_utils.py new file mode 100644 index 0000000..1a39419 --- /dev/null +++ b/imaginairy/vendored/facexlib/utils/face_utils.py @@ -0,0 +1,250 @@ +import cv2 +import numpy as np +import torch + + +def compute_increased_bbox(bbox, increase_area, preserve_aspect=True): + left, top, right, bot = bbox + width = right - left + height = bot - top + + if preserve_aspect: + width_increase = max(increase_area, ((1 + 2 * increase_area) * height - width) / (2 * width)) + height_increase = max(increase_area, ((1 + 2 * increase_area) * width - height) / (2 * height)) + else: + width_increase = height_increase = increase_area + left = int(left - width_increase * width) + top = int(top - height_increase * height) + right = int(right + width_increase * width) + bot = int(bot + height_increase * height) + return (left, top, right, bot) + + +def get_valid_bboxes(bboxes, h, w): + left = max(bboxes[0], 0) + top = 
max(bboxes[1], 0) + right = min(bboxes[2], w) + bottom = min(bboxes[3], h) + return (left, top, right, bottom) + + +def align_crop_face_landmarks(img, + landmarks, + output_size, + transform_size=None, + enable_padding=True, + return_inverse_affine=False, + shrink_ratio=(1, 1)): + """Align and crop face with landmarks. + + The output_size and transform_size are based on width. The height is + adjusted based on shrink_ratio_h/shring_ration_w. + + Modified from: + https://github.com/NVlabs/ffhq-dataset/blob/master/download_ffhq.py + + Args: + img (Numpy array): Input image. + landmarks (Numpy array): 5 or 68 or 98 landmarks. + output_size (int): Output face size. + transform_size (ing): Transform size. Usually the four time of + output_size. + enable_padding (float): Default: True. + shrink_ratio (float | tuple[float] | list[float]): Shring the whole + face for height and width (crop larger area). Default: (1, 1). + + Returns: + (Numpy array): Cropped face. + """ + lm_type = 'retinaface_5' # Options: dlib_5, retinaface_5 + + if isinstance(shrink_ratio, (float, int)): + shrink_ratio = (shrink_ratio, shrink_ratio) + if transform_size is None: + transform_size = output_size * 4 + + # Parse landmarks + lm = np.array(landmarks) + if lm.shape[0] == 5 and lm_type == 'retinaface_5': + eye_left = lm[0] + eye_right = lm[1] + mouth_avg = (lm[3] + lm[4]) * 0.5 + elif lm.shape[0] == 5 and lm_type == 'dlib_5': + lm_eye_left = lm[2:4] + lm_eye_right = lm[0:2] + eye_left = np.mean(lm_eye_left, axis=0) + eye_right = np.mean(lm_eye_right, axis=0) + mouth_avg = lm[4] + elif lm.shape[0] == 68: + lm_eye_left = lm[36:42] + lm_eye_right = lm[42:48] + eye_left = np.mean(lm_eye_left, axis=0) + eye_right = np.mean(lm_eye_right, axis=0) + mouth_avg = (lm[48] + lm[54]) * 0.5 + elif lm.shape[0] == 98: + lm_eye_left = lm[60:68] + lm_eye_right = lm[68:76] + eye_left = np.mean(lm_eye_left, axis=0) + eye_right = np.mean(lm_eye_right, axis=0) + mouth_avg = (lm[76] + lm[82]) * 0.5 + + eye_avg = (eye_left + eye_right) * 0.5 + eye_to_eye = eye_right - eye_left + eye_to_mouth = mouth_avg - eye_avg + + # Get the oriented crop rectangle + # x: half width of the oriented crop rectangle + x = eye_to_eye - np.flipud(eye_to_mouth) * [-1, 1] + # - np.flipud(eye_to_mouth) * [-1, 1]: rotate 90 clockwise + # norm with the hypotenuse: get the direction + x /= np.hypot(*x) # get the hypotenuse of a right triangle + rect_scale = 1 # TODO: you can edit it to get larger rect + x *= max(np.hypot(*eye_to_eye) * 2.0 * rect_scale, np.hypot(*eye_to_mouth) * 1.8 * rect_scale) + # y: half height of the oriented crop rectangle + y = np.flipud(x) * [-1, 1] + + x *= shrink_ratio[1] # width + y *= shrink_ratio[0] # height + + # c: center + c = eye_avg + eye_to_mouth * 0.1 + # quad: (left_top, left_bottom, right_bottom, right_top) + quad = np.stack([c - x - y, c - x + y, c + x + y, c + x - y]) + # qsize: side length of the square + qsize = np.hypot(*x) * 2 + + quad_ori = np.copy(quad) + # Shrink, for large face + # TODO: do we really need shrink + shrink = int(np.floor(qsize / output_size * 0.5)) + if shrink > 1: + h, w = img.shape[0:2] + rsize = (int(np.rint(float(w) / shrink)), int(np.rint(float(h) / shrink))) + img = cv2.resize(img, rsize, interpolation=cv2.INTER_AREA) + quad /= shrink + qsize /= shrink + + # Crop + h, w = img.shape[0:2] + border = max(int(np.rint(qsize * 0.1)), 3) + crop = (int(np.floor(min(quad[:, 0]))), int(np.floor(min(quad[:, 1]))), int(np.ceil(max(quad[:, 0]))), + int(np.ceil(max(quad[:, 1])))) + crop = (max(crop[0] - border, 0), 
max(crop[1] - border, 0), min(crop[2] + border, w), min(crop[3] + border, h)) + if crop[2] - crop[0] < w or crop[3] - crop[1] < h: + img = img[crop[1]:crop[3], crop[0]:crop[2], :] + quad -= crop[0:2] + + # Pad + # pad: (width_left, height_top, width_right, height_bottom) + h, w = img.shape[0:2] + pad = (int(np.floor(min(quad[:, 0]))), int(np.floor(min(quad[:, 1]))), int(np.ceil(max(quad[:, 0]))), + int(np.ceil(max(quad[:, 1])))) + pad = (max(-pad[0] + border, 0), max(-pad[1] + border, 0), max(pad[2] - w + border, 0), max(pad[3] - h + border, 0)) + if enable_padding and max(pad) > border - 4: + pad = np.maximum(pad, int(np.rint(qsize * 0.3))) + img = np.pad(img, ((pad[1], pad[3]), (pad[0], pad[2]), (0, 0)), 'reflect') + h, w = img.shape[0:2] + y, x, _ = np.ogrid[:h, :w, :1] + mask = np.maximum(1.0 - np.minimum(np.float32(x) / pad[0], + np.float32(w - 1 - x) / pad[2]), + 1.0 - np.minimum(np.float32(y) / pad[1], + np.float32(h - 1 - y) / pad[3])) + blur = int(qsize * 0.02) + if blur % 2 == 0: + blur += 1 + blur_img = cv2.boxFilter(img, 0, ksize=(blur, blur)) + + img = img.astype('float32') + img += (blur_img - img) * np.clip(mask * 3.0 + 1.0, 0.0, 1.0) + img += (np.median(img, axis=(0, 1)) - img) * np.clip(mask, 0.0, 1.0) + img = np.clip(img, 0, 255) # float32, [0, 255] + quad += pad[:2] + + # Transform use cv2 + h_ratio = shrink_ratio[0] / shrink_ratio[1] + dst_h, dst_w = int(transform_size * h_ratio), transform_size + template = np.array([[0, 0], [0, dst_h], [dst_w, dst_h], [dst_w, 0]]) + # use cv2.LMEDS method for the equivalence to skimage transform + # ref: https://blog.csdn.net/yichxi/article/details/115827338 + affine_matrix = cv2.estimateAffinePartial2D(quad, template, method=cv2.LMEDS)[0] + cropped_face = cv2.warpAffine( + img, affine_matrix, (dst_w, dst_h), borderMode=cv2.BORDER_CONSTANT, borderValue=(135, 133, 132)) # gray + + if output_size < transform_size: + cropped_face = cv2.resize( + cropped_face, (output_size, int(output_size * h_ratio)), interpolation=cv2.INTER_LINEAR) + + if return_inverse_affine: + dst_h, dst_w = int(output_size * h_ratio), output_size + template = np.array([[0, 0], [0, dst_h], [dst_w, dst_h], [dst_w, 0]]) + # use cv2.LMEDS method for the equivalence to skimage transform + # ref: https://blog.csdn.net/yichxi/article/details/115827338 + affine_matrix = cv2.estimateAffinePartial2D( + quad_ori, np.array([[0, 0], [0, output_size], [dst_w, dst_h], [dst_w, 0]]), method=cv2.LMEDS)[0] + inverse_affine = cv2.invertAffineTransform(affine_matrix) + else: + inverse_affine = None + return cropped_face, inverse_affine + + +def paste_face_back(img, face, inverse_affine): + h, w = img.shape[0:2] + face_h, face_w = face.shape[0:2] + inv_restored = cv2.warpAffine(face, inverse_affine, (w, h)) + mask = np.ones((face_h, face_w, 3), dtype=np.float32) + inv_mask = cv2.warpAffine(mask, inverse_affine, (w, h)) + # remove the black borders + inv_mask_erosion = cv2.erode(inv_mask, np.ones((2, 2), np.uint8)) + inv_restored_remove_border = inv_mask_erosion * inv_restored + total_face_area = np.sum(inv_mask_erosion) // 3 + # compute the fusion edge based on the area of face + w_edge = int(total_face_area**0.5) // 20 + erosion_radius = w_edge * 2 + inv_mask_center = cv2.erode(inv_mask_erosion, np.ones((erosion_radius, erosion_radius), np.uint8)) + blur_size = w_edge * 2 + inv_soft_mask = cv2.GaussianBlur(inv_mask_center, (blur_size + 1, blur_size + 1), 0) + img = inv_soft_mask * inv_restored_remove_border + (1 - inv_soft_mask) * img + # float32, [0, 255] + return img + + +if __name__ 
== '__main__': + import os + + from imaginairy.vendored.facexlib.detection import init_detection_model + from imaginairy.vendored.facexlib.utils.face_restoration_helper import get_largest_face + from imaginairy.vendored.facexlib.visualization import visualize_detection + + img_path = '/home/wxt/datasets/ffhq/ffhq_wild/00009.png' + img_name = os.path.splitext(os.path.basename(img_path))[0] + + # initialize model + det_net = init_detection_model('retinaface_resnet50', half=False) + img_ori = cv2.imread(img_path) + h, w = img_ori.shape[0:2] + # if larger than 800, scale it + scale = max(h / 800, w / 800) + if scale > 1: + img = cv2.resize(img_ori, (int(w / scale), int(h / scale)), interpolation=cv2.INTER_LINEAR) + + with torch.no_grad(): + bboxes = det_net.detect_faces(img, 0.97) + if scale > 1: + bboxes *= scale # the score is incorrect + bboxes = get_largest_face(bboxes, h, w)[0] + visualize_detection(img_ori, [bboxes], f'tmp/{img_name}_det.png') + + landmarks = np.array([[bboxes[i], bboxes[i + 1]] for i in range(5, 15, 2)]) + + cropped_face, inverse_affine = align_crop_face_landmarks( + img_ori, + landmarks, + output_size=512, + transform_size=None, + enable_padding=True, + return_inverse_affine=True, + shrink_ratio=(1, 1)) + + cv2.imwrite(f'tmp/{img_name}_cropped_face.png', cropped_face) + img = paste_face_back(img_ori, cropped_face, inverse_affine) + cv2.imwrite(f'tmp/{img_name}_back.png', img) diff --git a/imaginairy/vendored/facexlib/utils/misc.py b/imaginairy/vendored/facexlib/utils/misc.py new file mode 100644 index 0000000..b1a597c --- /dev/null +++ b/imaginairy/vendored/facexlib/utils/misc.py @@ -0,0 +1,118 @@ +import cv2 +import os +import os.path as osp +import torch +from torch.hub import download_url_to_file, get_dir +from urllib.parse import urlparse + +ROOT_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + + +def imwrite(img, file_path, params=None, auto_mkdir=True): + """Write image to file. + + Args: + img (ndarray): Image array to be written. + file_path (str): Image file path. + params (None or list): Same as opencv's :func:`imwrite` interface. + auto_mkdir (bool): If the parent folder of `file_path` does not exist, + whether to create it automatically. + + Returns: + bool: Successful or not. + """ + if auto_mkdir: + dir_name = os.path.abspath(os.path.dirname(file_path)) + os.makedirs(dir_name, exist_ok=True) + return cv2.imwrite(file_path, img, params) + + +def img2tensor(imgs, bgr2rgb=True, float32=True): + """Numpy array to tensor. + + Args: + imgs (list[ndarray] | ndarray): Input images. + bgr2rgb (bool): Whether to change bgr to rgb. + float32 (bool): Whether to change to float32. + + Returns: + list[tensor] | tensor: Tensor images. If returned results only have + one element, just return tensor.
+ """ + + def _totensor(img, bgr2rgb, float32): + if img.shape[2] == 3 and bgr2rgb: + if img.dtype == 'float64': + img = img.astype('float32') + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + img = torch.from_numpy(img.transpose(2, 0, 1)) + if float32: + img = img.float() + return img + + if isinstance(imgs, list): + return [_totensor(img, bgr2rgb, float32) for img in imgs] + else: + return _totensor(imgs, bgr2rgb, float32) + + +def load_file_from_url(url, model_dir=None, progress=True, file_name=None, save_dir=None): + """Ref:https://github.com/1adrianb/face-alignment/blob/master/face_alignment/utils.py + """ + if model_dir is None: + hub_dir = get_dir() + model_dir = os.path.join(hub_dir, 'checkpoints') + + if save_dir is None: + save_dir = os.path.join(ROOT_DIR, model_dir) + os.makedirs(save_dir, exist_ok=True) + + parts = urlparse(url) + filename = os.path.basename(parts.path) + if file_name is not None: + filename = file_name + cached_file = os.path.abspath(os.path.join(save_dir, filename)) + if not os.path.exists(cached_file): + print(f'Downloading: "{url}" to {cached_file}\n') + download_url_to_file(url, cached_file, hash_prefix=None, progress=progress) + return cached_file + + +def scandir(dir_path, suffix=None, recursive=False, full_path=False): + """Scan a directory to find the interested files. + Args: + dir_path (str): Path of the directory. + suffix (str | tuple(str), optional): File suffix that we are + interested in. Default: None. + recursive (bool, optional): If set to True, recursively scan the + directory. Default: False. + full_path (bool, optional): If set to True, include the dir_path. + Default: False. + Returns: + A generator for all the interested files with relative paths. + """ + + if (suffix is not None) and not isinstance(suffix, (str, tuple)): + raise TypeError('"suffix" must be a string or tuple of strings') + + root = dir_path + + def _scandir(dir_path, suffix, recursive): + for entry in os.scandir(dir_path): + if not entry.name.startswith('.') and entry.is_file(): + if full_path: + return_path = entry.path + else: + return_path = osp.relpath(entry.path, root) + + if suffix is None: + yield return_path + elif return_path.endswith(suffix): + yield return_path + else: + if recursive: + yield from _scandir(entry.path, suffix=suffix, recursive=recursive) + else: + continue + + return _scandir(dir_path, suffix=suffix, recursive=recursive) diff --git a/imaginairy/vendored/facexlib/visualization/__init__.py b/imaginairy/vendored/facexlib/visualization/__init__.py new file mode 100644 index 0000000..290fee7 --- /dev/null +++ b/imaginairy/vendored/facexlib/visualization/__init__.py @@ -0,0 +1,5 @@ +from .vis_alignment import visualize_alignment +from .vis_detection import visualize_detection +from .vis_headpose import visualize_headpose + +__all__ = ['visualize_detection', 'visualize_alignment', 'visualize_headpose'] diff --git a/imaginairy/vendored/facexlib/visualization/vis_alignment.py b/imaginairy/vendored/facexlib/visualization/vis_alignment.py new file mode 100644 index 0000000..a98be8c --- /dev/null +++ b/imaginairy/vendored/facexlib/visualization/vis_alignment.py @@ -0,0 +1,18 @@ +import cv2 +import numpy as np + + +def visualize_alignment(img, landmarks, save_path=None, to_bgr=False): + img = np.copy(img) + h, w = img.shape[0:2] + circle_size = int(max(h, w) / 150) + if to_bgr: + img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) + + for landmarks_face in landmarks: + for lm in landmarks_face: + cv2.circle(img, (int(lm[0]), int(lm[1])), 1, (0, 150, 0), circle_size) + 
+ # save img + if save_path is not None: + cv2.imwrite(save_path, img) diff --git a/imaginairy/vendored/facexlib/visualization/vis_detection.py b/imaginairy/vendored/facexlib/visualization/vis_detection.py new file mode 100644 index 0000000..495f6c4 --- /dev/null +++ b/imaginairy/vendored/facexlib/visualization/vis_detection.py @@ -0,0 +1,29 @@ +import cv2 +import numpy as np + + +def visualize_detection(img, bboxes_and_landmarks, save_path=None, to_bgr=False): + """Visualize detection results. + + Args: + img (Numpy array): Input image. CHW, BGR, [0, 255], uint8. + """ + img = np.copy(img) + if to_bgr: + img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) + + for b in bboxes_and_landmarks: + # confidence + cv2.putText(img, f'{b[4]:.4f}', (int(b[0]), int(b[1] + 12)), cv2.FONT_HERSHEY_DUPLEX, 0.5, (255, 255, 255)) + # bounding boxes + b = list(map(int, b)) + cv2.rectangle(img, (b[0], b[1]), (b[2], b[3]), (0, 0, 255), 2) + # landmarks (for retinaface) + cv2.circle(img, (b[5], b[6]), 1, (0, 0, 255), 4) + cv2.circle(img, (b[7], b[8]), 1, (0, 255, 255), 4) + cv2.circle(img, (b[9], b[10]), 1, (255, 0, 255), 4) + cv2.circle(img, (b[11], b[12]), 1, (0, 255, 0), 4) + cv2.circle(img, (b[13], b[14]), 1, (255, 0, 0), 4) + # save img + if save_path is not None: + cv2.imwrite(save_path, img) diff --git a/imaginairy/vendored/facexlib/visualization/vis_headpose.py b/imaginairy/vendored/facexlib/visualization/vis_headpose.py new file mode 100644 index 0000000..1ee1584 --- /dev/null +++ b/imaginairy/vendored/facexlib/visualization/vis_headpose.py @@ -0,0 +1,91 @@ +import cv2 +import numpy as np +from math import cos, sin + + +def draw_axis(img, yaw, pitch, roll, tdx=None, tdy=None, size=100): + """draw head pose axis.""" + + pitch = pitch * np.pi / 180 + yaw = -yaw * np.pi / 180 + roll = roll * np.pi / 180 + + if tdx is None or tdy is None: + height, width = img.shape[:2] + tdx = width / 2 + tdy = height / 2 + + # X axis pointing to right, drawn in red + x1 = size * (cos(yaw) * cos(roll)) + tdx + y1 = size * (cos(pitch) * sin(roll) + cos(roll) * sin(pitch) * sin(yaw)) + tdy + # Y axis pointing downside, drawn in green + x2 = size * (-cos(yaw) * sin(roll)) + tdx + y2 = size * (cos(pitch) * cos(roll) - sin(pitch) * sin(yaw) * sin(roll)) + tdy + # Z axis, out of the screen, drawn in blue + x3 = size * (sin(yaw)) + tdx + y3 = size * (-cos(yaw) * sin(pitch)) + tdy + + cv2.line(img, (int(tdx), int(tdy)), (int(x1), int(y1)), (0, 0, 255), 3) + cv2.line(img, (int(tdx), int(tdy)), (int(x2), int(y2)), (0, 255, 0), 3) + cv2.line(img, (int(tdx), int(tdy)), (int(x3), int(y3)), (255, 0, 0), 2) + + return img + + +def draw_pose_cube(img, yaw, pitch, roll, tdx=None, tdy=None, size=150.): + """draw head pose cube. + Where (tdx, tdy) is the translation of the face. 
+ For pose we have [pitch yaw roll tdx tdy tdz scale_factor] + """ + + p = pitch * np.pi / 180 + y = -yaw * np.pi / 180 + r = roll * np.pi / 180 + if tdx is not None and tdy is not None: + face_x = tdx - 0.50 * size + face_y = tdy - 0.50 * size + else: + height, width = img.shape[:2] + face_x = width / 2 - 0.5 * size + face_y = height / 2 - 0.5 * size + + x1 = size * (cos(y) * cos(r)) + face_x + y1 = size * (cos(p) * sin(r) + cos(r) * sin(p) * sin(y)) + face_y + x2 = size * (-cos(y) * sin(r)) + face_x + y2 = size * (cos(p) * cos(r) - sin(p) * sin(y) * sin(r)) + face_y + x3 = size * (sin(y)) + face_x + y3 = size * (-cos(y) * sin(p)) + face_y + + # Draw base in red + cv2.line(img, (int(face_x), int(face_y)), (int(x1), int(y1)), (0, 0, 255), 3) + cv2.line(img, (int(face_x), int(face_y)), (int(x2), int(y2)), (0, 0, 255), 3) + cv2.line(img, (int(x2), int(y2)), (int(x2 + x1 - face_x), int(y2 + y1 - face_y)), (0, 0, 255), 3) + cv2.line(img, (int(x1), int(y1)), (int(x1 + x2 - face_x), int(y1 + y2 - face_y)), (0, 0, 255), 3) + # Draw pillars in blue + cv2.line(img, (int(face_x), int(face_y)), (int(x3), int(y3)), (255, 0, 0), 2) + cv2.line(img, (int(x1), int(y1)), (int(x1 + x3 - face_x), int(y1 + y3 - face_y)), (255, 0, 0), 2) + cv2.line(img, (int(x2), int(y2)), (int(x2 + x3 - face_x), int(y2 + y3 - face_y)), (255, 0, 0), 2) + cv2.line(img, (int(x2 + x1 - face_x), int(y2 + y1 - face_y)), + (int(x3 + x1 + x2 - 2 * face_x), int(y3 + y2 + y1 - 2 * face_y)), (255, 0, 0), 2) + # Draw top in green + cv2.line(img, (int(x3 + x1 - face_x), int(y3 + y1 - face_y)), + (int(x3 + x1 + x2 - 2 * face_x), int(y3 + y2 + y1 - 2 * face_y)), (0, 255, 0), 2) + cv2.line(img, (int(x2 + x3 - face_x), int(y2 + y3 - face_y)), + (int(x3 + x1 + x2 - 2 * face_x), int(y3 + y2 + y1 - 2 * face_y)), (0, 255, 0), 2) + cv2.line(img, (int(x3), int(y3)), (int(x3 + x1 - face_x), int(y3 + y1 - face_y)), (0, 255, 0), 2) + cv2.line(img, (int(x3), int(y3)), (int(x3 + x2 - face_x), int(y3 + y2 - face_y)), (0, 255, 0), 2) + + return img + + +def visualize_headpose(img, yaw, pitch, roll, save_path=None, to_bgr=False): + img = np.copy(img) + if to_bgr: + img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) + show_string = (f'y {yaw[0].item():.2f}, p {pitch[0].item():.2f}, ' + f'r {roll[0].item():.2f}') + cv2.putText(img, show_string, (30, img.shape[0] - 30), fontFace=1, fontScale=1, color=(0, 0, 255), thickness=2) + draw_pose_cube(img, yaw[0], pitch[0], roll[0], size=100) + draw_axis(img, yaw[0], pitch[0], roll[0], tdx=50, tdy=50, size=100) + # save img + if save_path is not None: + cv2.imwrite(save_path, img) diff --git a/requirements-dev.txt b/requirements-dev.txt index 60ca636..c635344 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -38,12 +38,8 @@ colorama==0.4.6 # via # griffe # mkdocs-material -contourpy==1.2.0 - # via matplotlib coverage==7.4.0 # via -r requirements-dev.in -cycler==0.12.1 - # via matplotlib diffusers==0.25.0 # via imaginAIry (setup.py) einops==0.7.0 @@ -52,8 +48,6 @@ exceptiongroup==1.2.0 # via # anyio # pytest -facexlib==0.3.0 - # via imaginAIry (setup.py) fastapi==0.108.0 # via imaginAIry (setup.py) filelock==3.13.1 @@ -62,10 +56,6 @@ filelock==3.13.1 # huggingface-hub # torch # transformers -filterpy==1.4.5 - # via facexlib -fonttools==4.47.0 - # via matplotlib fsspec==2023.12.2 # via # huggingface-hub @@ -86,7 +76,7 @@ httpcore==1.0.2 # via httpx httpx==0.26.0 # via -r requirements-dev.in -huggingface-hub==0.20.1 +huggingface-hub==0.20.2 # via # diffusers # open-clip-torch @@ -112,12 +102,8 @@ jinja2==3.1.2 # 
mkdocs-material # mkdocstrings # torch -kiwisolver==1.4.5 - # via matplotlib kornia==0.7.1 # via imaginAIry (setup.py) -llvmlite==0.41.1 - # via numba markdown==3.5.1 # via # mkdocs @@ -131,10 +117,6 @@ markupsafe==2.1.3 # jinja2 # mkdocs # mkdocstrings -matplotlib==3.7.4 - # via - # -c tests/constraints.txt - # filterpy mergedeep==1.3.4 # via mkdocs mkdocs==1.5.3 @@ -164,20 +146,13 @@ mypy-extensions==1.0.0 # via mypy networkx==3.2.1 # via torch -numba==0.58.1 - # via facexlib numpy==1.24.4 # via # -c tests/constraints.txt - # contourpy # diffusers - # facexlib - # filterpy # imageio # imaginAIry (setup.py) # jaxtyping - # matplotlib - # numba # opencv-python # scipy # torchvision @@ -187,14 +162,11 @@ omegaconf==2.3.0 open-clip-torch==2.23.0 # via imaginAIry (setup.py) opencv-python==4.9.0.80 - # via - # facexlib - # imaginAIry (setup.py) + # via imaginAIry (setup.py) packaging==23.2 # via # huggingface-hub # kornia - # matplotlib # mkdocs # pytest # pytest-sugar @@ -206,10 +178,8 @@ pathspec==0.12.1 pillow==10.2.0 # via # diffusers - # facexlib # imageio # imaginAIry (setup.py) - # matplotlib # torchvision platformdirs==4.1.0 # via @@ -236,7 +206,7 @@ pymdown-extensions==10.7 # mkdocs-material # mkdocstrings pyparsing==3.1.1 - # via matplotlib + # via imaginAIry (setup.py) pytest==7.4.4 # via # -r requirements-dev.in @@ -250,9 +220,7 @@ pytest-randomly==3.15.0 pytest-sugar==0.9.7 # via -r requirements-dev.in python-dateutil==2.8.2 - # via - # ghp-import - # matplotlib + # via ghp-import pyyaml==6.0.1 # via # huggingface-hub @@ -292,8 +260,6 @@ safetensors==0.4.1 # transformers scipy==1.10.1 # via - # facexlib - # filterpy # imaginAIry (setup.py) # torchdiffeq sentencepiece==0.1.99 @@ -324,7 +290,6 @@ tomli==2.0.1 # pytest torch==2.1.2 # via - # facexlib # imaginAIry (setup.py) # kornia # open-clip-torch @@ -335,13 +300,11 @@ torchdiffeq==0.2.3 # via imaginAIry (setup.py) torchvision==0.16.2 # via - # facexlib # imaginAIry (setup.py) # open-clip-torch # timm tqdm==4.66.1 # via - # facexlib # huggingface-hub # imaginAIry (setup.py) # open-clip-torch @@ -350,13 +313,13 @@ transformers==4.36.2 # via imaginAIry (setup.py) typeguard==2.13.3 # via jaxtyping -types-pillow==10.1.0.2 +types-pillow==10.1.0.20240106 # via -r requirements-dev.in -types-psutil==5.9.5.17 +types-psutil==5.9.5.20240106 # via -r requirements-dev.in -types-requests==2.31.0.20231231 +types-requests==2.31.0.20240106 # via -r requirements-dev.in -types-tqdm==4.66.0.5 +types-tqdm==4.66.0.20240106 # via -r requirements-dev.in typing-extensions==4.9.0 # via @@ -378,7 +341,7 @@ uvicorn==0.25.0 # via imaginAIry (setup.py) watchdog==3.0.0 # via mkdocs -wcwidth==0.2.12 +wcwidth==0.2.13 # via ftfy wheel==0.42.0 # via -r requirements-dev.in diff --git a/setup.py b/setup.py index f22c40a..7c8fd07 100644 --- a/setup.py +++ b/setup.py @@ -78,7 +78,6 @@ setup( "click-help-colors>=0.9.1", "click-shell>=2.0", "protobuf != 3.20.2, != 3.19.5", - "facexlib>=0.2.1.1", "fastapi>=0.70.0", "ftfy>=6.0.1", # for vendored clip "torch>=2.1.0", @@ -94,6 +93,8 @@ setup( "opencv-python>=4.4.0.46", # need to migration to 2.0 "pydantic>=2.3.0", + # pyparsing used for masking logic and creating text images + "pyparsing>=3.0.0", "requests>=2.28.1", # "refiners>=0.2.0", "jaxtyping>=0.2.23", # refiners dependency