-
Notifications
You must be signed in to change notification settings - Fork 192
Modernize codebase: auto-detect GPU/CPU, replace print with logging, update deps #78
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -1,10 +1,10 @@ | ||||||
| #!/usr/bin/python | ||||||
| #!/usr/bin/env python3 | ||||||
| #-*- coding: utf-8 -*- | ||||||
| # Video 25 FPS, Audio 16000HZ | ||||||
|
|
||||||
| import torch | ||||||
| import numpy | ||||||
| import time, pdb, argparse, subprocess, os, math, glob | ||||||
| import time, pdb, argparse, subprocess, os, math, glob, logging | ||||||
| import cv2 | ||||||
| import python_speech_features | ||||||
|
|
||||||
|
|
@@ -13,11 +13,13 @@ | |||||
| from SyncNetModel import * | ||||||
| from shutil import rmtree | ||||||
|
|
||||||
| logger = logging.getLogger(__name__) | ||||||
|
|
||||||
|
|
||||||
| # ==================== Get OFFSET ==================== | ||||||
|
|
||||||
| def calc_pdist(feat1, feat2, vshift=10): | ||||||
|
|
||||||
| win_size = vshift*2+1 | ||||||
|
|
||||||
| feat2p = torch.nn.functional.pad(feat2,(0,0,vshift,vshift)) | ||||||
|
|
@@ -34,14 +36,16 @@ def calc_pdist(feat1, feat2, vshift=10): | |||||
|
|
||||||
| class SyncNetInstance(torch.nn.Module): | ||||||
|
|
||||||
| def __init__(self, dropout = 0, num_layers_in_fc_layers = 1024): | ||||||
| super(SyncNetInstance, self).__init__(); | ||||||
| def __init__(self, dropout = 0, num_layers_in_fc_layers = 1024, device=None): | ||||||
| super().__init__() | ||||||
|
|
||||||
| self.__S__ = S(num_layers_in_fc_layers = num_layers_in_fc_layers).cuda(); | ||||||
| self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu') | ||||||
| logger.info('Using device: %s', self.device) | ||||||
| self.__S__ = S(num_layers_in_fc_layers = num_layers_in_fc_layers).to(self.device) | ||||||
|
|
||||||
| def evaluate(self, opt, videofile): | ||||||
|
|
||||||
| self.__S__.eval(); | ||||||
| self.__S__.eval() | ||||||
|
|
||||||
| # ========== ========== | ||||||
| # Convert files | ||||||
|
|
@@ -52,18 +56,21 @@ def evaluate(self, opt, videofile): | |||||
|
|
||||||
| os.makedirs(os.path.join(opt.tmp_dir,opt.reference)) | ||||||
|
|
||||||
| command = ("ffmpeg -y -i %s -threads 1 -f image2 %s" % (videofile,os.path.join(opt.tmp_dir,opt.reference,'%06d.jpg'))) | ||||||
| output = subprocess.call(command, shell=True, stdout=None) | ||||||
| command = ["ffmpeg", "-y", "-i", videofile, "-threads", "1", "-f", "image2", | ||||||
| os.path.join(opt.tmp_dir, opt.reference, '%06d.jpg')] | ||||||
| subprocess.run(command, check=True) | ||||||
|
|
||||||
| command = ["ffmpeg", "-y", "-i", videofile, "-async", "1", "-ac", "1", "-vn", | ||||||
| "-acodec", "pcm_s16le", "-ar", "16000", | ||||||
| os.path.join(opt.tmp_dir, opt.reference, 'audio.wav')] | ||||||
| subprocess.run(command, check=True) | ||||||
|
|
||||||
| command = ("ffmpeg -y -i %s -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 %s" % (videofile,os.path.join(opt.tmp_dir,opt.reference,'audio.wav'))) | ||||||
| output = subprocess.call(command, shell=True, stdout=None) | ||||||
|
|
||||||
| # ========== ========== | ||||||
| # Load video | ||||||
| # Load video | ||||||
| # ========== ========== | ||||||
|
|
||||||
| images = [] | ||||||
|
|
||||||
| flist = glob.glob(os.path.join(opt.tmp_dir,opt.reference,'*.jpg')) | ||||||
| flist.sort() | ||||||
|
|
||||||
|
|
@@ -74,7 +81,7 @@ def evaluate(self, opt, videofile): | |||||
| im = numpy.expand_dims(im,axis=0) | ||||||
| im = numpy.transpose(im,(0,3,4,1,2)) | ||||||
|
|
||||||
| imtv = torch.autograd.Variable(torch.from_numpy(im.astype(float)).float()) | ||||||
| imtv = torch.from_numpy(im.astype(float)).float() | ||||||
|
|
||||||
| # ========== ========== | ||||||
| # Load audio | ||||||
|
|
@@ -85,17 +92,17 @@ def evaluate(self, opt, videofile): | |||||
| mfcc = numpy.stack([numpy.array(i) for i in mfcc]) | ||||||
|
|
||||||
| cc = numpy.expand_dims(numpy.expand_dims(mfcc,axis=0),axis=0) | ||||||
| cct = torch.autograd.Variable(torch.from_numpy(cc.astype(float)).float()) | ||||||
| cct = torch.from_numpy(cc.astype(float)).float() | ||||||
|
|
||||||
| # ========== ========== | ||||||
| # Check audio and video input length | ||||||
| # ========== ========== | ||||||
|
|
||||||
| if (float(len(audio))/16000) != (float(len(images))/25) : | ||||||
| print("WARNING: Audio (%.4fs) and video (%.4fs) lengths are different."%(float(len(audio))/16000,float(len(images))/25)) | ||||||
| logger.warning("Audio (%.4fs) and video (%.4fs) lengths are different.",float(len(audio))/16000,float(len(images))/25) | ||||||
|
|
||||||
| min_length = min(len(images),math.floor(len(audio)/640)) | ||||||
|
|
||||||
| # ========== ========== | ||||||
| # Generate video and audio feats | ||||||
| # ========== ========== | ||||||
|
|
@@ -106,15 +113,15 @@ def evaluate(self, opt, videofile): | |||||
|
|
||||||
| tS = time.time() | ||||||
| for i in range(0,lastframe,opt.batch_size): | ||||||
|
|
||||||
| im_batch = [ imtv[:,:,vframe:vframe+5,:,:] for vframe in range(i,min(lastframe,i+opt.batch_size)) ] | ||||||
| im_in = torch.cat(im_batch,0) | ||||||
| im_out = self.__S__.forward_lip(im_in.cuda()); | ||||||
| im_out = self.__S__.forward_lip(im_in.to(self.device)) | ||||||
| im_feat.append(im_out.data.cpu()) | ||||||
|
|
||||||
| cc_batch = [ cct[:,:,:,vframe*4:vframe*4+20] for vframe in range(i,min(lastframe,i+opt.batch_size)) ] | ||||||
| cc_in = torch.cat(cc_batch,0) | ||||||
| cc_out = self.__S__.forward_aud(cc_in.cuda()) | ||||||
| cc_out = self.__S__.forward_aud(cc_in.to(self.device)) | ||||||
| cc_feat.append(cc_out.data.cpu()) | ||||||
|
|
||||||
| im_feat = torch.cat(im_feat,0) | ||||||
|
|
@@ -123,8 +130,8 @@ def evaluate(self, opt, videofile): | |||||
| # ========== ========== | ||||||
| # Compute offset | ||||||
| # ========== ========== | ||||||
| print('Compute time %.3f sec.' % (time.time()-tS)) | ||||||
|
|
||||||
| logger.info('Compute time %.3f sec.', time.time()-tS) | ||||||
|
|
||||||
| dists = calc_pdist(im_feat,cc_feat,vshift=opt.vshift) | ||||||
| mdist = torch.mean(torch.stack(dists,1),1) | ||||||
|
|
@@ -138,25 +145,27 @@ def evaluate(self, opt, videofile): | |||||
| # fdist = numpy.pad(fdist, (3,3), 'constant', constant_values=15) | ||||||
| fconf = torch.median(mdist).numpy() - fdist | ||||||
| fconfm = signal.medfilt(fconf,kernel_size=9) | ||||||
|
|
||||||
| numpy.set_printoptions(formatter={'float': '{: 0.3f}'.format}) | ||||||
| print('Framewise conf: ') | ||||||
| print(fconfm) | ||||||
| print('AV offset: \t%d \nMin dist: \t%.3f\nConfidence: \t%.3f' % (offset,minval,conf)) | ||||||
| logger.info('Framewise conf: ') | ||||||
| logger.info(fconfm) | ||||||
| logger.info('AV offset: \t%d', offset.item()) | ||||||
| logger.info('Min dist: \t%.3f', minval.item()) | ||||||
| logger.info('Confidence: \t%.3f', conf.item()) | ||||||
|
|
||||||
| dists_npy = numpy.array([ dist.numpy() for dist in dists ]) | ||||||
| return offset.numpy(), conf.numpy(), dists_npy | ||||||
|
|
||||||
| def extract_feature(self, opt, videofile): | ||||||
|
|
||||||
| self.__S__.eval(); | ||||||
| self.__S__.eval() | ||||||
|
|
||||||
| # ========== ========== | ||||||
| # Load video | ||||||
| # Load video | ||||||
| # ========== ========== | ||||||
| cap = cv2.VideoCapture(videofile) | ||||||
|
|
||||||
| frame_num = 1; | ||||||
| frame_num = 1 | ||||||
| images = [] | ||||||
| while frame_num: | ||||||
| frame_num += 1 | ||||||
|
|
@@ -170,8 +179,8 @@ def extract_feature(self, opt, videofile): | |||||
| im = numpy.expand_dims(im,axis=0) | ||||||
| im = numpy.transpose(im,(0,3,4,1,2)) | ||||||
|
|
||||||
| imtv = torch.autograd.Variable(torch.from_numpy(im.astype(float)).float()) | ||||||
| imtv = torch.from_numpy(im.astype(float)).float() | ||||||
|
|
||||||
| # ========== ========== | ||||||
| # Generate video feats | ||||||
| # ========== ========== | ||||||
|
|
@@ -181,28 +190,28 @@ def extract_feature(self, opt, videofile): | |||||
|
|
||||||
| tS = time.time() | ||||||
| for i in range(0,lastframe,opt.batch_size): | ||||||
|
|
||||||
| im_batch = [ imtv[:,:,vframe:vframe+5,:,:] for vframe in range(i,min(lastframe,i+opt.batch_size)) ] | ||||||
| im_in = torch.cat(im_batch,0) | ||||||
| im_out = self.__S__.forward_lipfeat(im_in.cuda()); | ||||||
| im_out = self.__S__.forward_lipfeat(im_in.to(self.device)) | ||||||
| im_feat.append(im_out.data.cpu()) | ||||||
|
|
||||||
| im_feat = torch.cat(im_feat,0) | ||||||
|
|
||||||
| # ========== ========== | ||||||
| # Compute offset | ||||||
| # ========== ========== | ||||||
| print('Compute time %.3f sec.' % (time.time()-tS)) | ||||||
|
|
||||||
| logger.info('Compute time %.3f sec.', time.time()-tS) | ||||||
|
|
||||||
| return im_feat | ||||||
|
|
||||||
|
|
||||||
| def loadParameters(self, path): | ||||||
| loaded_state = torch.load(path, map_location=lambda storage, loc: storage); | ||||||
| loaded_state = torch.load(path, map_location=lambda storage, loc: storage, weights_only=True) | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Setting
Suggested change
|
||||||
|
|
||||||
| self_state = self.__S__.state_dict(); | ||||||
| self_state = self.__S__.state_dict() | ||||||
|
|
||||||
| for name, param in loaded_state.items(): | ||||||
|
|
||||||
| self_state[name].copy_(param); | ||||||
| self_state[name].copy_(param) | ||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| Comparing floating-point values for exact equality is generally unreliable due to precision issues. It is safer to use
`math.isclose` with an appropriate tolerance, especially when comparing durations derived from different sources (audio samples vs. video frames).