diff --git a/README.md b/README.md index 7da5354..d3473b0 100755 --- a/README.md +++ b/README.md @@ -7,12 +7,18 @@ This repository contains the demo for the audio-to-video synchronisation network Please cite the paper below if you make use of the software. ## Dependencies + ``` -pip install -r requirements.txt +conda env create -f environment.yml ``` -In addition, `ffmpeg` is required. +## Getting Started + +Download the pretrained model: +``` +sh download_model.sh +``` ## Demo @@ -21,16 +27,17 @@ SyncNet demo: python demo_syncnet.py --videofile data/example.avi --tmp_dir /path/to/temp/directory ``` -Check that this script returns: +Check that this script returns approximately the following values (minor differences are expected depending on your platform and package versions): ``` AV offset: 3 Min dist: 5.353 Confidence: 10.021 ``` -Full pipeline: +## Full Pipeline + +Run the three stages — face detection and tracking, sync offset estimation, and visualisation: ``` -sh download_model.sh python run_pipeline.py --videofile /path/to/video.mp4 --reference name_of_video --data_dir /path/to/output python run_syncnet.py --videofile /path/to/video.mp4 --reference name_of_video --data_dir /path/to/output python run_visualise.py --videofile /path/to/video.mp4 --reference name_of_video --data_dir /path/to/output @@ -39,7 +46,6 @@ python run_visualise.py --videofile /path/to/video.mp4 --reference name_of_video Outputs: ``` $DATA_DIR/pycrop/$REFERENCE/*.avi - cropped face tracks -$DATA_DIR/pywork/$REFERENCE/offsets.txt - audio-video offset values $DATA_DIR/pyavi/$REFERENCE/video_out.avi - output video (as shown below) ```
diff --git a/SyncNetInstance.py b/SyncNetInstance.py index 497d44f..d23e1b4 100644 --- a/SyncNetInstance.py +++ b/SyncNetInstance.py @@ -1,10 +1,10 @@ -#!/usr/bin/python +#!/usr/bin/env python3 #-*- coding: utf-8 -*- # Video 25 FPS, Audio 16000HZ import torch import numpy -import time, pdb, argparse, subprocess, os, math, glob +import time, pdb, argparse, subprocess, os, math, glob, logging import cv2 import python_speech_features @@ -13,11 +13,13 @@ from SyncNetModel import * from shutil import rmtree +logger = logging.getLogger(__name__) + # ==================== Get OFFSET ==================== def calc_pdist(feat1, feat2, vshift=10): - + win_size = vshift*2+1 feat2p = torch.nn.functional.pad(feat2,(0,0,vshift,vshift)) @@ -34,14 +36,16 @@ def calc_pdist(feat1, feat2, vshift=10): class SyncNetInstance(torch.nn.Module): - def __init__(self, dropout = 0, num_layers_in_fc_layers = 1024): - super(SyncNetInstance, self).__init__(); + def __init__(self, dropout = 0, num_layers_in_fc_layers = 1024, device=None): + super().__init__() - self.__S__ = S(num_layers_in_fc_layers = num_layers_in_fc_layers).cuda(); + self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu') + logger.info('Using device: %s', self.device) + self.__S__ = S(num_layers_in_fc_layers = num_layers_in_fc_layers).to(self.device) def evaluate(self, opt, videofile): - self.__S__.eval(); + self.__S__.eval() # ========== ========== # Convert files @@ -52,18 +56,21 @@ def evaluate(self, opt, videofile): os.makedirs(os.path.join(opt.tmp_dir,opt.reference)) - command = ("ffmpeg -y -i %s -threads 1 -f image2 %s" % (videofile,os.path.join(opt.tmp_dir,opt.reference,'%06d.jpg'))) - output = subprocess.call(command, shell=True, stdout=None) + command = ["ffmpeg", "-y", "-i", videofile, "-threads", "1", "-f", "image2", + os.path.join(opt.tmp_dir, opt.reference, '%06d.jpg')] + subprocess.run(command, check=True) + + command = ["ffmpeg", "-y", "-i", videofile, "-async", "1", "-ac", "1", "-vn", + "-acodec", 
"pcm_s16le", "-ar", "16000", + os.path.join(opt.tmp_dir, opt.reference, 'audio.wav')] + subprocess.run(command, check=True) - command = ("ffmpeg -y -i %s -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 %s" % (videofile,os.path.join(opt.tmp_dir,opt.reference,'audio.wav'))) - output = subprocess.call(command, shell=True, stdout=None) - # ========== ========== - # Load video + # Load video # ========== ========== images = [] - + flist = glob.glob(os.path.join(opt.tmp_dir,opt.reference,'*.jpg')) flist.sort() @@ -74,7 +81,7 @@ def evaluate(self, opt, videofile): im = numpy.expand_dims(im,axis=0) im = numpy.transpose(im,(0,3,4,1,2)) - imtv = torch.autograd.Variable(torch.from_numpy(im.astype(float)).float()) + imtv = torch.from_numpy(im.astype(float)).float() # ========== ========== # Load audio @@ -85,17 +92,17 @@ def evaluate(self, opt, videofile): mfcc = numpy.stack([numpy.array(i) for i in mfcc]) cc = numpy.expand_dims(numpy.expand_dims(mfcc,axis=0),axis=0) - cct = torch.autograd.Variable(torch.from_numpy(cc.astype(float)).float()) + cct = torch.from_numpy(cc.astype(float)).float() # ========== ========== # Check audio and video input length # ========== ========== if (float(len(audio))/16000) != (float(len(images))/25) : - print("WARNING: Audio (%.4fs) and video (%.4fs) lengths are different."%(float(len(audio))/16000,float(len(images))/25)) + logger.warning("Audio (%.4fs) and video (%.4fs) lengths are different.",float(len(audio))/16000,float(len(images))/25) min_length = min(len(images),math.floor(len(audio)/640)) - + # ========== ========== # Generate video and audio feats # ========== ========== @@ -106,15 +113,15 @@ def evaluate(self, opt, videofile): tS = time.time() for i in range(0,lastframe,opt.batch_size): - + im_batch = [ imtv[:,:,vframe:vframe+5,:,:] for vframe in range(i,min(lastframe,i+opt.batch_size)) ] im_in = torch.cat(im_batch,0) - im_out = self.__S__.forward_lip(im_in.cuda()); + im_out = self.__S__.forward_lip(im_in.to(self.device)) 
im_feat.append(im_out.data.cpu()) cc_batch = [ cct[:,:,:,vframe*4:vframe*4+20] for vframe in range(i,min(lastframe,i+opt.batch_size)) ] cc_in = torch.cat(cc_batch,0) - cc_out = self.__S__.forward_aud(cc_in.cuda()) + cc_out = self.__S__.forward_aud(cc_in.to(self.device)) cc_feat.append(cc_out.data.cpu()) im_feat = torch.cat(im_feat,0) @@ -123,8 +130,8 @@ def evaluate(self, opt, videofile): # ========== ========== # Compute offset # ========== ========== - - print('Compute time %.3f sec.' % (time.time()-tS)) + + logger.info('Compute time %.3f sec.', time.time()-tS) dists = calc_pdist(im_feat,cc_feat,vshift=opt.vshift) mdist = torch.mean(torch.stack(dists,1),1) @@ -138,25 +145,27 @@ def evaluate(self, opt, videofile): # fdist = numpy.pad(fdist, (3,3), 'constant', constant_values=15) fconf = torch.median(mdist).numpy() - fdist fconfm = signal.medfilt(fconf,kernel_size=9) - + numpy.set_printoptions(formatter={'float': '{: 0.3f}'.format}) - print('Framewise conf: ') - print(fconfm) - print('AV offset: \t%d \nMin dist: \t%.3f\nConfidence: \t%.3f' % (offset,minval,conf)) + logger.info('Framewise conf: ') + logger.info(fconfm) + logger.info('AV offset: \t%d', offset.item()) + logger.info('Min dist: \t%.3f', minval.item()) + logger.info('Confidence: \t%.3f', conf.item()) dists_npy = numpy.array([ dist.numpy() for dist in dists ]) return offset.numpy(), conf.numpy(), dists_npy def extract_feature(self, opt, videofile): - self.__S__.eval(); - + self.__S__.eval() + # ========== ========== - # Load video + # Load video # ========== ========== cap = cv2.VideoCapture(videofile) - frame_num = 1; + frame_num = 1 images = [] while frame_num: frame_num += 1 @@ -170,8 +179,8 @@ def extract_feature(self, opt, videofile): im = numpy.expand_dims(im,axis=0) im = numpy.transpose(im,(0,3,4,1,2)) - imtv = torch.autograd.Variable(torch.from_numpy(im.astype(float)).float()) - + imtv = torch.from_numpy(im.astype(float)).float() + # ========== ========== # Generate video feats # ========== 
========== @@ -181,10 +190,10 @@ def extract_feature(self, opt, videofile): tS = time.time() for i in range(0,lastframe,opt.batch_size): - + im_batch = [ imtv[:,:,vframe:vframe+5,:,:] for vframe in range(i,min(lastframe,i+opt.batch_size)) ] im_in = torch.cat(im_batch,0) - im_out = self.__S__.forward_lipfeat(im_in.cuda()); + im_out = self.__S__.forward_lipfeat(im_in.to(self.device)) im_feat.append(im_out.data.cpu()) im_feat = torch.cat(im_feat,0) @@ -192,17 +201,17 @@ def extract_feature(self, opt, videofile): # ========== ========== # Compute offset # ========== ========== - - print('Compute time %.3f sec.' % (time.time()-tS)) + + logger.info('Compute time %.3f sec.', time.time()-tS) return im_feat def loadParameters(self, path): - loaded_state = torch.load(path, map_location=lambda storage, loc: storage); + loaded_state = torch.load(path, map_location=lambda storage, loc: storage, weights_only=True) - self_state = self.__S__.state_dict(); + self_state = self.__S__.state_dict() for name, param in loaded_state.items(): - self_state[name].copy_(param); + self_state[name].copy_(param) diff --git a/SyncNetModel.py b/SyncNetModel.py index c21ce25..cf97caf 100755 --- a/SyncNetModel.py +++ b/SyncNetModel.py @@ -1,25 +1,16 @@ -#!/usr/bin/python +#!/usr/bin/env python3 #-*- coding: utf-8 -*- import torch import torch.nn as nn -def save(model, filename): - with open(filename, "wb") as f: - torch.save(model, f); - print("%s saved."%filename); - -def load(filename): - net = torch.load(filename) - return net; - class S(nn.Module): def __init__(self, num_layers_in_fc_layers = 1024): - super(S, self).__init__(); + super().__init__() - self.__nFeatures__ = 24; - self.__nChs__ = 32; - self.__midChs__ = 32; + self.__nFeatures__ = 24 + self.__nChs__ = 32 + self.__midChs__ = 32 self.netcnnaud = nn.Sequential( nn.Conv2d(1, 64, kernel_size=(3,3), stride=(1,1), padding=(1,1)), @@ -44,25 +35,25 @@ def __init__(self, num_layers_in_fc_layers = 1024): nn.BatchNorm2d(256), 
nn.ReLU(inplace=True), nn.MaxPool2d(kernel_size=(3,3), stride=(2,2)), - + nn.Conv2d(256, 512, kernel_size=(5,4), padding=(0,0)), nn.BatchNorm2d(512), nn.ReLU(), - ); + ) self.netfcaud = nn.Sequential( nn.Linear(512, 512), nn.BatchNorm1d(512), nn.ReLU(), nn.Linear(512, num_layers_in_fc_layers), - ); + ) self.netfclip = nn.Sequential( nn.Linear(512, 512), nn.BatchNorm1d(512), nn.ReLU(), nn.Linear(512, num_layers_in_fc_layers), - ); + ) self.netcnnlip = nn.Sequential( nn.Conv3d(3, 96, kernel_size=(5,7,7), stride=(1,2,2), padding=0), @@ -91,27 +82,27 @@ def __init__(self, num_layers_in_fc_layers = 1024): nn.Conv3d(256, 512, kernel_size=(1,6,6), padding=0), nn.BatchNorm3d(512), nn.ReLU(inplace=True), - ); + ) def forward_aud(self, x): - mid = self.netcnnaud(x); # N x ch x 24 x M - mid = mid.view((mid.size()[0], -1)); # N x (ch x 24) - out = self.netfcaud(mid); + mid = self.netcnnaud(x) # N x ch x 24 x M + mid = mid.view((mid.size(0), -1)) # N x (ch x 24) + out = self.netfcaud(mid) - return out; + return out def forward_lip(self, x): - mid = self.netcnnlip(x); - mid = mid.view((mid.size()[0], -1)); # N x (ch x 24) - out = self.netfclip(mid); + mid = self.netcnnlip(x) + mid = mid.view((mid.size(0), -1)) # N x (ch x 24) + out = self.netfclip(mid) - return out; + return out def forward_lipfeat(self, x): - mid = self.netcnnlip(x); - out = mid.view((mid.size()[0], -1)); # N x (ch x 24) + mid = self.netcnnlip(x) + out = mid.view((mid.size(0), -1)) # N x (ch x 24) - return out; \ No newline at end of file + return out diff --git a/demo_feature.py b/demo_feature.py index e3bd290..416acfc 100755 --- a/demo_feature.py +++ b/demo_feature.py @@ -1,31 +1,34 @@ -#!/usr/bin/python +#!/usr/bin/env python3 #-*- coding: utf-8 -*- -import time, pdb, argparse, subprocess +import time, pdb, argparse, subprocess, logging from SyncNetInstance import * +logging.basicConfig(level=logging.INFO, format='%(asctime)s %(name)s %(levelname)s: %(message)s') +logger = logging.getLogger(__name__) + # 
==================== LOAD PARAMS ==================== -parser = argparse.ArgumentParser(description = "SyncNet"); +parser = argparse.ArgumentParser(description = "SyncNet") -parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help=''); -parser.add_argument('--batch_size', type=int, default='20', help=''); -parser.add_argument('--vshift', type=int, default='15', help=''); -parser.add_argument('--videofile', type=str, default="data/example.avi", help=''); -parser.add_argument('--tmp_dir', type=str, default="data", help=''); -parser.add_argument('--save_as', type=str, default="data/features.pt", help=''); +parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help='') +parser.add_argument('--batch_size', type=int, default='20', help='') +parser.add_argument('--vshift', type=int, default='15', help='') +parser.add_argument('--videofile', type=str, default="data/example.avi", help='') +parser.add_argument('--tmp_dir', type=str, default="data", help='') +parser.add_argument('--save_as', type=str, default="data/features.pt", help='') -opt = parser.parse_args(); +opt = parser.parse_args() # ==================== RUN EVALUATION ==================== -s = SyncNetInstance(); +s = SyncNetInstance() -s.loadParameters(opt.initial_model); -print("Model %s loaded."%opt.initial_model); +s.loadParameters(opt.initial_model) +logger.info("Model %s loaded.", opt.initial_model) feats = s.extract_feature(opt, videofile=opt.videofile) diff --git a/demo_syncnet.py b/demo_syncnet.py index 01c25a6..8826b0a 100755 --- a/demo_syncnet.py +++ b/demo_syncnet.py @@ -1,30 +1,33 @@ -#!/usr/bin/python +#!/usr/bin/env python3 #-*- coding: utf-8 -*- -import time, pdb, argparse, subprocess +import time, pdb, argparse, subprocess, logging from SyncNetInstance import * +logging.basicConfig(level=logging.INFO, format='%(asctime)s %(name)s %(levelname)s: %(message)s') +logger = logging.getLogger(__name__) + # ==================== LOAD PARAMS 
==================== -parser = argparse.ArgumentParser(description = "SyncNet"); +parser = argparse.ArgumentParser(description = "SyncNet") -parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help=''); -parser.add_argument('--batch_size', type=int, default='20', help=''); -parser.add_argument('--vshift', type=int, default='15', help=''); -parser.add_argument('--videofile', type=str, default="data/example.avi", help=''); -parser.add_argument('--tmp_dir', type=str, default="data/work/pytmp", help=''); -parser.add_argument('--reference', type=str, default="demo", help=''); +parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help='') +parser.add_argument('--batch_size', type=int, default='20', help='') +parser.add_argument('--vshift', type=int, default='15', help='') +parser.add_argument('--videofile', type=str, default="data/example.avi", help='') +parser.add_argument('--tmp_dir', type=str, default="data/work/pytmp", help='') +parser.add_argument('--reference', type=str, default="demo", help='') -opt = parser.parse_args(); +opt = parser.parse_args() # ==================== RUN EVALUATION ==================== -s = SyncNetInstance(); +s = SyncNetInstance() -s.loadParameters(opt.initial_model); -print("Model %s loaded."%opt.initial_model); +s.loadParameters(opt.initial_model) +logger.info("Model %s loaded.", opt.initial_model) s.evaluate(opt, videofile=opt.videofile) diff --git a/detectors/s3fd/__init__.py b/detectors/s3fd/__init__.py index d7f35e0..3c61da7 100644 --- a/detectors/s3fd/__init__.py +++ b/detectors/s3fd/__init__.py @@ -1,11 +1,12 @@ -import time +import time, logging import numpy as np import cv2 import torch -from torchvision import transforms from .nets import S3FDNet from .box_utils import nms_ +logger = logging.getLogger(__name__) + PATH_WEIGHT = './detectors/s3fd/weights/sfd_face.pth' img_mean = np.array([104., 117., 123.])[:, np.newaxis, np.newaxis].astype('float32') @@ -17,12 +18,12 @@ def 
__init__(self, device='cuda'): tstamp = time.time() self.device = device - print('[S3FD] loading with', self.device) + logger.info('[S3FD] loading with %s', self.device) self.net = S3FDNet(device=self.device).to(self.device) - state_dict = torch.load(PATH_WEIGHT, map_location=self.device) + state_dict = torch.load(PATH_WEIGHT, map_location=self.device, weights_only=True) self.net.load_state_dict(state_dict) self.net.eval() - print('[S3FD] finished loading (%.4f sec)' % (time.time() - tstamp)) + logger.info('[S3FD] finished loading (%.4f sec)', time.time() - tstamp) def detect_faces(self, image, conf_th=0.8, scales=[1]): diff --git a/detectors/s3fd/box_utils.py b/detectors/s3fd/box_utils.py index 0779bcd..00686a2 100644 --- a/detectors/s3fd/box_utils.py +++ b/detectors/s3fd/box_utils.py @@ -35,7 +35,7 @@ def nms_(dets, thresh): inds = np.where(ovr <= thresh)[0] order = order[inds + 1] - return np.array(keep).astype(np.int) + return np.array(keep).astype(np.intp) def decode(loc, priors, variances): @@ -82,45 +82,32 @@ def nms(boxes, scores, overlap=0.5, top_k=200): v, idx = scores.sort(0) # sort in ascending order # I = I[v >= 0.01] idx = idx[-top_k:] # indices of the top-k largest vals - xx1 = boxes.new() - yy1 = boxes.new() - xx2 = boxes.new() - yy2 = boxes.new() - w = boxes.new() - h = boxes.new() - - # keep = torch.Tensor() + count = 0 while idx.numel() > 0: i = idx[-1] # index of current largest val - # keep.append(i) keep[count] = i count += 1 if idx.size(0) == 1: break idx = idx[:-1] # remove kept element from view # load bboxes of next highest vals - torch.index_select(x1, 0, idx, out=xx1) - torch.index_select(y1, 0, idx, out=yy1) - torch.index_select(x2, 0, idx, out=xx2) - torch.index_select(y2, 0, idx, out=yy2) + xx1 = torch.index_select(x1, 0, idx) + yy1 = torch.index_select(y1, 0, idx) + xx2 = torch.index_select(x2, 0, idx) + yy2 = torch.index_select(y2, 0, idx) # store element-wise max with next highest score xx1 = torch.clamp(xx1, min=x1[i]) yy1 = 
torch.clamp(yy1, min=y1[i]) xx2 = torch.clamp(xx2, max=x2[i]) yy2 = torch.clamp(yy2, max=y2[i]) - w.resize_as_(xx2) - h.resize_as_(yy2) - w = xx2 - xx1 - h = yy2 - yy1 - # check sizes of xx1 and xx2.. after each iteration - w = torch.clamp(w, min=0.0) - h = torch.clamp(h, min=0.0) + w = torch.clamp(xx2 - xx1, min=0.0) + h = torch.clamp(yy2 - yy1, min=0.0) inter = w * h # IoU = i / (area(a) + area(b) - i) - rem_areas = torch.index_select(area, 0, idx) # load remaining areas) + rem_areas = torch.index_select(area, 0, idx) union = (rem_areas - inter) + area[i] - IoU = inter / union # store result in iou + IoU = inter / union # keep only elements with an IoU <= overlap idx = idx[IoU.le(overlap)] return keep, count @@ -181,7 +168,7 @@ def __init__(self, input_size, feature_maps, steps=[4, 8, 16, 32, 64, 128], clip=False): - super(PriorBox, self).__init__() + super().__init__() self.imh = input_size[0] self.imw = input_size[1] diff --git a/detectors/s3fd/nets.py b/detectors/s3fd/nets.py index 85b5c82..937a73f 100644 --- a/detectors/s3fd/nets.py +++ b/detectors/s3fd/nets.py @@ -8,7 +8,7 @@ class L2Norm(nn.Module): def __init__(self, n_channels, scale): - super(L2Norm, self).__init__() + super().__init__() self.n_channels = n_channels self.gamma = scale or None self.eps = 1e-10 @@ -28,7 +28,7 @@ def forward(self, x): class S3FDNet(nn.Module): def __init__(self, device='cuda'): - super(S3FDNet, self).__init__() + super().__init__() self.device = device self.vgg = nn.ModuleList([ diff --git a/download_model.sh b/download_model.sh index 3e3a9dc..34895d9 100755 --- a/download_model.sh +++ b/download_model.sh @@ -1,9 +1,9 @@ # SyncNet model -mkdir data +mkdir -p data wget http://www.robots.ox.ac.uk/~vgg/software/lipsync/data/syncnet_v2.model -O data/syncnet_v2.model wget http://www.robots.ox.ac.uk/~vgg/software/lipsync/data/example.avi -O data/example.avi # For the pre-processing pipeline -mkdir detectors/s3fd/weights +mkdir -p detectors/s3fd/weights wget 
https://www.robots.ox.ac.uk/~vgg/software/lipsync/data/sfd_face.pth -O detectors/s3fd/weights/sfd_face.pth \ No newline at end of file diff --git a/environment-cpu.yml b/environment-cpu.yml new file mode 100644 index 0000000..9dc3b99 --- /dev/null +++ b/environment-cpu.yml @@ -0,0 +1,27 @@ +name: syncnet +channels: + - conda-forge + - pytorch + - defaults +dependencies: + # Core Python and Math Libraries + - python=3.10 + - numpy + - scipy + + # PyTorch Ecosystem + - pytorch::pytorch==2.5.1 + - pytorch::torchvision==0.20.1 + - pytorch::torchaudio==2.5.1 + + # External Tools + - ffmpeg + + # Pip Installer + - pip + + # Pip-specific packages (Runs after Conda finishes) + - pip: + - scenedetect==0.6.7.1 + - opencv-contrib-python==4.13.0.92 + - python_speech_features==0.6 diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..edd13be --- /dev/null +++ b/environment.yml @@ -0,0 +1,29 @@ +name: syncnet +channels: + - conda-forge + - pytorch + - nvidia + - defaults +dependencies: + # Core Python and Math Libraries + - python=3.10 + - numpy + - scipy + + # PyTorch Ecosystem + - pytorch::pytorch==2.5.1 + - pytorch::torchvision==0.20.1 + - pytorch::torchaudio==2.5.1 + - pytorch::pytorch-cuda=12.4 + + # External Tools + - ffmpeg + + # Pip Installer + - pip + + # Pip-specific packages (Runs after Conda finishes) + - pip: + - scenedetect==0.6.7.1 + - opencv-contrib-python==4.13.0.92 + - python_speech_features==0.6 diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 8919740..0000000 --- a/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -torch>=1.4.0 -torchvision>=0.5.0 -numpy>=1.18.1 -scipy>=1.2.1 -scenedetect==0.5.1 -opencv-contrib-python -python_speech_features diff --git a/run_pipeline.py b/run_pipeline.py index f5fc22e..50fee52 100755 --- a/run_pipeline.py +++ b/run_pipeline.py @@ -1,15 +1,14 @@ -#!/usr/bin/python +#!/usr/bin/env python3 -import sys, time, os, pdb, argparse, pickle, subprocess, glob, cv2 +import sys, 
time, os, pdb, argparse, pickle, subprocess, glob, cv2, logging import numpy as np +import torch from shutil import rmtree -import scenedetect -from scenedetect.video_manager import VideoManager -from scenedetect.scene_manager import SceneManager -from scenedetect.frame_timecode import FrameTimecode -from scenedetect.stats_manager import StatsManager -from scenedetect.detectors import ContentDetector +logging.basicConfig(level=logging.INFO, format='%(asctime)s %(name)s %(levelname)s: %(message)s') +logger = logging.getLogger(__name__) + +from scenedetect import open_video, SceneManager, ContentDetector from scipy.interpolate import interp1d from scipy.io import wavfile @@ -21,17 +20,17 @@ # # PARSE ARGS # ========== ========== ========== ========== -parser = argparse.ArgumentParser(description = "FaceTracker"); -parser.add_argument('--data_dir', type=str, default='data/work', help='Output direcotry'); -parser.add_argument('--videofile', type=str, default='', help='Input video file'); -parser.add_argument('--reference', type=str, default='', help='Video reference'); -parser.add_argument('--facedet_scale', type=float, default=0.25, help='Scale factor for face detection'); -parser.add_argument('--crop_scale', type=float, default=0.40, help='Scale bounding box'); -parser.add_argument('--min_track', type=int, default=100, help='Minimum facetrack duration'); -parser.add_argument('--frame_rate', type=int, default=25, help='Frame rate'); -parser.add_argument('--num_failed_det', type=int, default=25, help='Number of missed detections allowed before tracking is stopped'); -parser.add_argument('--min_face_size', type=int, default=100, help='Minimum face size in pixels'); -opt = parser.parse_args(); +parser = argparse.ArgumentParser(description = "FaceTracker") +parser.add_argument('--data_dir', type=str, default='data/work', help='Output directory') +parser.add_argument('--videofile', type=str, default='', help='Input video file') +parser.add_argument('--reference', type=str, 
default='', help='Video reference') +parser.add_argument('--facedet_scale', type=float, default=0.25, help='Scale factor for face detection') +parser.add_argument('--crop_scale', type=float, default=0.40, help='Scale bounding box') +parser.add_argument('--min_track', type=int, default=100, help='Minimum facetrack duration') +parser.add_argument('--frame_rate', type=int, default=25, help='Frame rate') +parser.add_argument('--num_failed_det', type=int, default=25, help='Number of missed detections allowed before tracking is stopped') +parser.add_argument('--min_face_size', type=int, default=100, help='Minimum face size in pixels') +opt = parser.parse_args() setattr(opt,'avi_dir',os.path.join(opt.data_dir,'pyavi')) setattr(opt,'tmp_dir',os.path.join(opt.data_dir,'pytmp')) @@ -44,19 +43,19 @@ # ========== ========== ========== ========== def bb_intersection_over_union(boxA, boxB): - + xA = max(boxA[0], boxB[0]) yA = max(boxA[1], boxB[1]) xB = min(boxA[2], boxB[2]) yB = min(boxA[3], boxB[3]) - + interArea = max(0, xB - xA) * max(0, yB - yA) - + boxAArea = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1]) boxBArea = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1]) - + iou = interArea / float(boxAArea + boxBArea - interArea) - + return iou # ========== ========== ========== ========== @@ -87,7 +86,7 @@ def track_shot(opt,scenefaces): if track == []: break elif len(track) > opt.min_track: - + framenum = np.array([ f['frame'] for f in track ]) bboxes = np.array([np.array(f['bbox']) for f in track]) @@ -107,7 +106,7 @@ def track_shot(opt,scenefaces): # ========== ========== ========== ========== # # VIDEO CROP AND SAVE # ========== ========== ========== ========== - + def crop_video(opt,track,cropfile): flist = glob.glob(os.path.join(opt.frames_dir,opt.reference,'*.jpg')) @@ -120,12 +119,12 @@ def crop_video(opt,track,cropfile): for det in track['bbox']: - dets['s'].append(max((det[3]-det[1]),(det[2]-det[0]))/2) - dets['y'].append((det[1]+det[3])/2) # crop center x + 
dets['s'].append(max((det[3]-det[1]),(det[2]-det[0]))/2) + dets['y'].append((det[1]+det[3])/2) # crop center x dets['x'].append((det[0]+det[2])/2) # crop center y # Smooth detections - dets['s'] = signal.medfilt(dets['s'],kernel_size=13) + dets['s'] = signal.medfilt(dets['s'],kernel_size=13) dets['x'] = signal.medfilt(dets['x'],kernel_size=13) dets['y'] = signal.medfilt(dets['y'],kernel_size=13) @@ -134,16 +133,16 @@ def crop_video(opt,track,cropfile): cs = opt.crop_scale bs = dets['s'][fidx] # Detection box size - bsi = int(bs*(1+2*cs)) # Pad videos by this amount + bsi = int(bs*(1+2*cs)) # Pad videos by this amount image = cv2.imread(flist[frame]) - + frame = np.pad(image,((bsi,bsi),(bsi,bsi),(0,0)), 'constant', constant_values=(110,110)) my = dets['y'][fidx]+bsi # BBox center Y mx = dets['x'][fidx]+bsi # BBox center X face = frame[int(my-bs):int(my+bs*(1+2*cs)),int(mx-bs*(1+cs)):int(mx+bs*(1+cs))] - + vOut.write(cv2.resize(face,(224,224))) audiotmp = os.path.join(opt.tmp_dir,opt.reference,'audio.wav') @@ -154,27 +153,25 @@ def crop_video(opt,track,cropfile): # ========== CROP AUDIO FILE ========== - command = ("ffmpeg -y -i %s -ss %.3f -to %.3f %s" % (os.path.join(opt.avi_dir,opt.reference,'audio.wav'),audiostart,audioend,audiotmp)) - output = subprocess.call(command, shell=True, stdout=None) - - if output != 0: - pdb.set_trace() + command = ["ffmpeg", "-y", "-i", + os.path.join(opt.avi_dir, opt.reference, 'audio.wav'), + "-ss", "%.3f" % audiostart, "-to", "%.3f" % audioend, + audiotmp] + subprocess.run(command, check=True) sample_rate, audio = wavfile.read(audiotmp) # ========== COMBINE AUDIO AND VIDEO FILES ========== - command = ("ffmpeg -y -i %st.avi -i %s -c:v copy -c:a copy %s.avi" % (cropfile,audiotmp,cropfile)) - output = subprocess.call(command, shell=True, stdout=None) - - if output != 0: - pdb.set_trace() + command = ["ffmpeg", "-y", "-i", cropfile+'t.avi', "-i", audiotmp, + "-c:v", "copy", "-c:a", "copy", cropfile+'.avi'] + subprocess.run(command, 
check=True) - print('Written %s'%cropfile) + logger.info('Written %s', cropfile) os.remove(cropfile+'t.avi') - print('Mean pos: x %.2f y %.2f s %.2f'%(np.mean(dets['x']),np.mean(dets['y']),np.mean(dets['s']))) + logger.info('Mean pos: x %.2f y %.2f s %.2f', np.mean(dets['x']),np.mean(dets['y']),np.mean(dets['s'])) return {'track':track, 'proc_track':dets} @@ -184,29 +181,30 @@ def crop_video(opt,track,cropfile): def inference_video(opt): - DET = S3FD(device='cuda') + device = 'cuda' if torch.cuda.is_available() else 'cpu' + DET = S3FD(device=device) flist = glob.glob(os.path.join(opt.frames_dir,opt.reference,'*.jpg')) flist.sort() dets = [] - + for fidx, fname in enumerate(flist): start_time = time.time() - + image = cv2.imread(fname) image_np = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) bboxes = DET.detect_faces(image_np, conf_th=0.9, scales=[opt.facedet_scale]) - dets.append([]); + dets.append([]) for bbox in bboxes: dets[-1].append({'frame':fidx, 'bbox':(bbox[:-1]).tolist(), 'conf':bbox[-1]}) elapsed_time = time.time() - start_time - print('%s-%05d; %d dets; %.2f Hz' % (os.path.join(opt.avi_dir,opt.reference,'video.avi'),fidx,len(dets[-1]),(1/elapsed_time))) + logger.info('%s-%05d; %d dets; %.2f Hz', os.path.join(opt.avi_dir,opt.reference,'video.avi'),fidx,len(dets[-1]),(1/elapsed_time)) savepath = os.path.join(opt.work_dir,opt.reference,'faces.pckl') @@ -221,33 +219,27 @@ def inference_video(opt): def scene_detect(opt): - video_manager = VideoManager([os.path.join(opt.avi_dir,opt.reference,'video.avi')]) - stats_manager = StatsManager() - scene_manager = SceneManager(stats_manager) - # Add ContentDetector algorithm (constructor takes detector options like threshold). 
- scene_manager.add_detector(ContentDetector()) - base_timecode = video_manager.get_base_timecode() - - video_manager.set_downscale_factor() - - video_manager.start() + video_path = os.path.join(opt.avi_dir,opt.reference,'video.avi') + video = open_video(video_path) - scene_manager.detect_scenes(frame_source=video_manager) + scene_manager = SceneManager() + scene_manager.add_detector(ContentDetector()) + scene_manager.detect_scenes(video) - scene_list = scene_manager.get_scene_list(base_timecode) + scene_list = scene_manager.get_scene_list() savepath = os.path.join(opt.work_dir,opt.reference,'scene.pckl') - if scene_list == []: - scene_list = [(video_manager.get_base_timecode(),video_manager.get_current_timecode())] + if not scene_list: + scene_list = [(video.base_timecode, video.base_timecode + video.duration)] with open(savepath, 'wb') as fil: pickle.dump(scene_list, fil) - print('%s - scenes detected %d'%(os.path.join(opt.avi_dir,opt.reference,'video.avi'),len(scene_list))) + logger.info('%s - scenes detected %d', video_path, len(scene_list)) return scene_list - + # ========== ========== ========== ========== # # EXECUTE DEMO @@ -255,39 +247,31 @@ def scene_detect(opt): # ========== DELETE EXISTING DIRECTORIES ========== -if os.path.exists(os.path.join(opt.work_dir,opt.reference)): - rmtree(os.path.join(opt.work_dir,opt.reference)) - -if os.path.exists(os.path.join(opt.crop_dir,opt.reference)): - rmtree(os.path.join(opt.crop_dir,opt.reference)) - -if os.path.exists(os.path.join(opt.avi_dir,opt.reference)): - rmtree(os.path.join(opt.avi_dir,opt.reference)) - -if os.path.exists(os.path.join(opt.frames_dir,opt.reference)): - rmtree(os.path.join(opt.frames_dir,opt.reference)) - -if os.path.exists(os.path.join(opt.tmp_dir,opt.reference)): - rmtree(os.path.join(opt.tmp_dir,opt.reference)) +for d in [opt.work_dir, opt.crop_dir, opt.avi_dir, opt.frames_dir, opt.tmp_dir]: + path = os.path.join(d, opt.reference) + if os.path.exists(path): + rmtree(path) # ========== MAKE 
NEW DIRECTORIES ========== -os.makedirs(os.path.join(opt.work_dir,opt.reference)) -os.makedirs(os.path.join(opt.crop_dir,opt.reference)) -os.makedirs(os.path.join(opt.avi_dir,opt.reference)) -os.makedirs(os.path.join(opt.frames_dir,opt.reference)) -os.makedirs(os.path.join(opt.tmp_dir,opt.reference)) +for d in [opt.work_dir, opt.crop_dir, opt.avi_dir, opt.frames_dir, opt.tmp_dir]: + os.makedirs(os.path.join(d, opt.reference), exist_ok=True) # ========== CONVERT VIDEO AND EXTRACT FRAMES ========== -command = ("ffmpeg -y -i %s -qscale:v 2 -async 1 -r 25 %s" % (opt.videofile,os.path.join(opt.avi_dir,opt.reference,'video.avi'))) -output = subprocess.call(command, shell=True, stdout=None) +command = ["ffmpeg", "-y", "-i", opt.videofile, "-qscale:v", "2", "-async", "1", "-r", "25", + os.path.join(opt.avi_dir, opt.reference, 'video.avi')] +subprocess.run(command, check=True) -command = ("ffmpeg -y -i %s -qscale:v 2 -threads 1 -f image2 %s" % (os.path.join(opt.avi_dir,opt.reference,'video.avi'),os.path.join(opt.frames_dir,opt.reference,'%06d.jpg'))) -output = subprocess.call(command, shell=True, stdout=None) +command = ["ffmpeg", "-y", "-i", os.path.join(opt.avi_dir, opt.reference, 'video.avi'), + "-qscale:v", "2", "-threads", "1", "-f", "image2", + os.path.join(opt.frames_dir, opt.reference, '%06d.jpg')] +subprocess.run(command, check=True) -command = ("ffmpeg -y -i %s -ac 1 -vn -acodec pcm_s16le -ar 16000 %s" % (os.path.join(opt.avi_dir,opt.reference,'video.avi'),os.path.join(opt.avi_dir,opt.reference,'audio.wav'))) -output = subprocess.call(command, shell=True, stdout=None) +command = ["ffmpeg", "-y", "-i", os.path.join(opt.avi_dir, opt.reference, 'video.avi'), + "-ac", "1", "-vn", "-acodec", "pcm_s16le", "-ar", "16000", + os.path.join(opt.avi_dir, opt.reference, 'audio.wav')] +subprocess.run(command, check=True) # ========== FACE DETECTION ========== @@ -304,8 +288,8 @@ def scene_detect(opt): for shot in scene: - if shot[1].frame_num - shot[0].frame_num >= 
opt.min_track : - alltracks.extend(track_shot(opt,faces[shot[0].frame_num:shot[1].frame_num])) + if shot[1].get_frames() - shot[0].get_frames() >= opt.min_track : + alltracks.extend(track_shot(opt,faces[shot[0].get_frames():shot[1].get_frames()])) # ========== FACE TRACK CROP ========== diff --git a/run_syncnet.py b/run_syncnet.py index 45099fd..dc9c2c0 100755 --- a/run_syncnet.py +++ b/run_syncnet.py @@ -1,20 +1,23 @@ -#!/usr/bin/python +#!/usr/bin/env python3 #-*- coding: utf-8 -*- -import time, pdb, argparse, subprocess, pickle, os, gzip, glob +import time, pdb, argparse, subprocess, pickle, os, gzip, glob, logging from SyncNetInstance import * +logging.basicConfig(level=logging.INFO, format='%(asctime)s %(name)s %(levelname)s: %(message)s') +logger = logging.getLogger(__name__) + # ==================== PARSE ARGUMENT ==================== -parser = argparse.ArgumentParser(description = "SyncNet"); -parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help=''); -parser.add_argument('--batch_size', type=int, default='20', help=''); -parser.add_argument('--vshift', type=int, default='15', help=''); -parser.add_argument('--data_dir', type=str, default='data/work', help=''); -parser.add_argument('--videofile', type=str, default='', help=''); -parser.add_argument('--reference', type=str, default='', help=''); -opt = parser.parse_args(); +parser = argparse.ArgumentParser(description = "SyncNet") +parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help='') +parser.add_argument('--batch_size', type=int, default='20', help='') +parser.add_argument('--vshift', type=int, default='15', help='') +parser.add_argument('--data_dir', type=str, default='data/work', help='') +parser.add_argument('--videofile', type=str, default='', help='') +parser.add_argument('--reference', type=str, default='', help='') +opt = parser.parse_args() setattr(opt,'avi_dir',os.path.join(opt.data_dir,'pyavi')) 
setattr(opt,'tmp_dir',os.path.join(opt.data_dir,'pytmp')) @@ -24,10 +27,10 @@ # ==================== LOAD MODEL AND FILE LIST ==================== -s = SyncNetInstance(); +s = SyncNetInstance() -s.loadParameters(opt.initial_model); -print("Model %s loaded."%opt.initial_model); +s.loadParameters(opt.initial_model) +logger.info("Model %s loaded.", opt.initial_model) flist = glob.glob(os.path.join(opt.crop_dir,opt.reference,'0*.avi')) flist.sort() @@ -38,7 +41,7 @@ for idx, fname in enumerate(flist): offset, conf, dist = s.evaluate(opt,videofile=fname) dists.append(dist) - + # ==================== PRINT RESULTS TO FILE ==================== with open(os.path.join(opt.work_dir,opt.reference,'activesd.pckl'), 'wb') as fil: diff --git a/run_visualise.py b/run_visualise.py index 85d8925..6148963 100644 --- a/run_visualise.py +++ b/run_visualise.py @@ -1,21 +1,24 @@ -#!/usr/bin/python +#!/usr/bin/env python3 #-*- coding: utf-8 -*- import torch import numpy -import time, pdb, argparse, subprocess, pickle, os, glob +import time, pdb, argparse, subprocess, pickle, os, glob, logging import cv2 +logging.basicConfig(level=logging.INFO, format='%(asctime)s %(name)s %(levelname)s: %(message)s') +logger = logging.getLogger(__name__) + from scipy import signal # ==================== PARSE ARGUMENT ==================== -parser = argparse.ArgumentParser(description = "SyncNet"); -parser.add_argument('--data_dir', type=str, default='data/work', help=''); -parser.add_argument('--videofile', type=str, default='', help=''); -parser.add_argument('--reference', type=str, default='', help=''); -parser.add_argument('--frame_rate', type=int, default=25, help='Frame rate'); -opt = parser.parse_args(); +parser = argparse.ArgumentParser(description = "SyncNet") +parser.add_argument('--data_dir', type=str, default='data/work', help='') +parser.add_argument('--videofile', type=str, default='', help='') +parser.add_argument('--reference', type=str, default='', help='') 
+parser.add_argument('--frame_rate', type=int, default=25, help='Frame rate') +opt = parser.parse_args() setattr(opt,'avi_dir',os.path.join(opt.data_dir,'pyavi')) setattr(opt,'tmp_dir',os.path.join(opt.data_dir,'pytmp')) @@ -42,8 +45,8 @@ mean_dists = numpy.mean(numpy.stack(dists[tidx],1),1) minidx = numpy.argmin(mean_dists,0) - minval = mean_dists[minidx] - + minval = mean_dists[minidx] + fdist = numpy.stack([dist[minidx] for dist in dists[tidx]]) fdist = numpy.pad(fdist, (3,3), 'constant', constant_values=10) @@ -69,20 +72,22 @@ for face in faces[fidx]: - clr = max(min(face['conf']*25,255),0) + clr = int(max(min(face['conf']*25,255),0)) cv2.rectangle(image,(int(face['x']-face['s']),int(face['y']-face['s'])),(int(face['x']+face['s']),int(face['y']+face['s'])),(0,clr,255-clr),3) cv2.putText(image,'Track %d, Conf %.3f'%(face['track'],face['conf']), (int(face['x']-face['s']),int(face['y']-face['s'])),cv2.FONT_HERSHEY_SIMPLEX,0.5,(255,255,255),2) vOut.write(image) - print('Frame %d'%fidx) + logger.info('Frame %d', fidx) vOut.release() # ========== COMBINE AUDIO AND VIDEO FILES ========== -command = ("ffmpeg -y -i %s -i %s -c:v copy -c:a copy %s" % (os.path.join(opt.avi_dir,opt.reference,'video_only.avi'),os.path.join(opt.avi_dir,opt.reference,'audio.wav'),os.path.join(opt.avi_dir,opt.reference,'video_out.avi'))) #-async 1 -output = subprocess.call(command, shell=True, stdout=None) - - +command = ["ffmpeg", "-y", "-i", + os.path.join(opt.avi_dir, opt.reference, 'video_only.avi'), + "-i", os.path.join(opt.avi_dir, opt.reference, 'audio.wav'), + "-c:v", "copy", "-c:a", "copy", + os.path.join(opt.avi_dir, opt.reference, 'video_out.avi')] +subprocess.run(command, check=True)