diff --git a/README.md b/README.md index 7da5354..d3473b0 100755 --- a/README.md +++ b/README.md @@ -7,12 +7,18 @@ This repository contains the demo for the audio-to-video synchronisation network Please cite the paper below if you make use of the software. ## Dependencies + ``` -pip install -r requirements.txt +conda env create -f environment.yml ``` -In addition, `ffmpeg` is required. +## Getting Started + +Download the pretrained model: +``` +sh download_model.sh +``` ## Demo @@ -21,16 +27,17 @@ SyncNet demo: python demo_syncnet.py --videofile data/example.avi --tmp_dir /path/to/temp/directory ``` -Check that this script returns: +Check that this script returns approximately the following values (minor differences are expected depending on your platform and package versions): ``` AV offset: 3 Min dist: 5.353 Confidence: 10.021 ``` -Full pipeline: +## Full Pipeline + +Run the three stages — face detection and tracking, sync offset estimation, and visualisation: ``` -sh download_model.sh python run_pipeline.py --videofile /path/to/video.mp4 --reference name_of_video --data_dir /path/to/output python run_syncnet.py --videofile /path/to/video.mp4 --reference name_of_video --data_dir /path/to/output python run_visualise.py --videofile /path/to/video.mp4 --reference name_of_video --data_dir /path/to/output @@ -39,7 +46,6 @@ python run_visualise.py --videofile /path/to/video.mp4 --reference name_of_video Outputs: ``` $DATA_DIR/pycrop/$REFERENCE/*.avi - cropped face tracks -$DATA_DIR/pywork/$REFERENCE/offsets.txt - audio-video offset values $DATA_DIR/pyavi/$REFERENCE/video_out.avi - output video (as shown below) ```
diff --git a/SyncNetInstance.py b/SyncNetInstance.py index 497d44f..d23e1b4 100644 --- a/SyncNetInstance.py +++ b/SyncNetInstance.py @@ -1,10 +1,10 @@ -#!/usr/bin/python +#!/usr/bin/env python3 #-*- coding: utf-8 -*- # Video 25 FPS, Audio 16000HZ import torch import numpy -import time, pdb, argparse, subprocess, os, math, glob +import time, pdb, argparse, subprocess, os, math, glob, logging import cv2 import python_speech_features @@ -13,11 +13,13 @@ from SyncNetModel import * from shutil import rmtree +logger = logging.getLogger(__name__) + # ==================== Get OFFSET ==================== def calc_pdist(feat1, feat2, vshift=10): - + win_size = vshift*2+1 feat2p = torch.nn.functional.pad(feat2,(0,0,vshift,vshift)) @@ -34,14 +36,16 @@ def calc_pdist(feat1, feat2, vshift=10): class SyncNetInstance(torch.nn.Module): - def __init__(self, dropout = 0, num_layers_in_fc_layers = 1024): - super(SyncNetInstance, self).__init__(); + def __init__(self, dropout = 0, num_layers_in_fc_layers = 1024, device=None): + super().__init__() - self.__S__ = S(num_layers_in_fc_layers = num_layers_in_fc_layers).cuda(); + self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu') + logger.info('Using device: %s', self.device) + self.__S__ = S(num_layers_in_fc_layers = num_layers_in_fc_layers).to(self.device) def evaluate(self, opt, videofile): - self.__S__.eval(); + self.__S__.eval() # ========== ========== # Convert files @@ -52,18 +56,21 @@ def evaluate(self, opt, videofile): os.makedirs(os.path.join(opt.tmp_dir,opt.reference)) - command = ("ffmpeg -y -i %s -threads 1 -f image2 %s" % (videofile,os.path.join(opt.tmp_dir,opt.reference,'%06d.jpg'))) - output = subprocess.call(command, shell=True, stdout=None) + command = ["ffmpeg", "-y", "-i", videofile, "-threads", "1", "-f", "image2", + os.path.join(opt.tmp_dir, opt.reference, '%06d.jpg')] + subprocess.run(command, check=True) + + command = ["ffmpeg", "-y", "-i", videofile, "-async", "1", "-ac", "1", "-vn", + "-acodec", 
"pcm_s16le", "-ar", "16000", + os.path.join(opt.tmp_dir, opt.reference, 'audio.wav')] + subprocess.run(command, check=True) - command = ("ffmpeg -y -i %s -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 %s" % (videofile,os.path.join(opt.tmp_dir,opt.reference,'audio.wav'))) - output = subprocess.call(command, shell=True, stdout=None) - # ========== ========== - # Load video + # Load video # ========== ========== images = [] - + flist = glob.glob(os.path.join(opt.tmp_dir,opt.reference,'*.jpg')) flist.sort() @@ -74,7 +81,7 @@ def evaluate(self, opt, videofile): im = numpy.expand_dims(im,axis=0) im = numpy.transpose(im,(0,3,4,1,2)) - imtv = torch.autograd.Variable(torch.from_numpy(im.astype(float)).float()) + imtv = torch.from_numpy(im.astype(float)).float() # ========== ========== # Load audio @@ -85,17 +92,17 @@ def evaluate(self, opt, videofile): mfcc = numpy.stack([numpy.array(i) for i in mfcc]) cc = numpy.expand_dims(numpy.expand_dims(mfcc,axis=0),axis=0) - cct = torch.autograd.Variable(torch.from_numpy(cc.astype(float)).float()) + cct = torch.from_numpy(cc.astype(float)).float() # ========== ========== # Check audio and video input length # ========== ========== if (float(len(audio))/16000) != (float(len(images))/25) : - print("WARNING: Audio (%.4fs) and video (%.4fs) lengths are different."%(float(len(audio))/16000,float(len(images))/25)) + logger.warning("Audio (%.4fs) and video (%.4fs) lengths are different.",float(len(audio))/16000,float(len(images))/25) min_length = min(len(images),math.floor(len(audio)/640)) - + # ========== ========== # Generate video and audio feats # ========== ========== @@ -106,15 +113,15 @@ def evaluate(self, opt, videofile): tS = time.time() for i in range(0,lastframe,opt.batch_size): - + im_batch = [ imtv[:,:,vframe:vframe+5,:,:] for vframe in range(i,min(lastframe,i+opt.batch_size)) ] im_in = torch.cat(im_batch,0) - im_out = self.__S__.forward_lip(im_in.cuda()); + im_out = self.__S__.forward_lip(im_in.to(self.device)) 
im_feat.append(im_out.data.cpu()) cc_batch = [ cct[:,:,:,vframe*4:vframe*4+20] for vframe in range(i,min(lastframe,i+opt.batch_size)) ] cc_in = torch.cat(cc_batch,0) - cc_out = self.__S__.forward_aud(cc_in.cuda()) + cc_out = self.__S__.forward_aud(cc_in.to(self.device)) cc_feat.append(cc_out.data.cpu()) im_feat = torch.cat(im_feat,0) @@ -123,8 +130,8 @@ def evaluate(self, opt, videofile): # ========== ========== # Compute offset # ========== ========== - - print('Compute time %.3f sec.' % (time.time()-tS)) + + logger.info('Compute time %.3f sec.', time.time()-tS) dists = calc_pdist(im_feat,cc_feat,vshift=opt.vshift) mdist = torch.mean(torch.stack(dists,1),1) @@ -138,25 +145,27 @@ def evaluate(self, opt, videofile): # fdist = numpy.pad(fdist, (3,3), 'constant', constant_values=15) fconf = torch.median(mdist).numpy() - fdist fconfm = signal.medfilt(fconf,kernel_size=9) - + numpy.set_printoptions(formatter={'float': '{: 0.3f}'.format}) - print('Framewise conf: ') - print(fconfm) - print('AV offset: \t%d \nMin dist: \t%.3f\nConfidence: \t%.3f' % (offset,minval,conf)) + logger.info('Framewise conf: ') + logger.info(fconfm) + logger.info('AV offset: \t%d', offset.item()) + logger.info('Min dist: \t%.3f', minval.item()) + logger.info('Confidence: \t%.3f', conf.item()) dists_npy = numpy.array([ dist.numpy() for dist in dists ]) return offset.numpy(), conf.numpy(), dists_npy def extract_feature(self, opt, videofile): - self.__S__.eval(); - + self.__S__.eval() + # ========== ========== - # Load video + # Load video # ========== ========== cap = cv2.VideoCapture(videofile) - frame_num = 1; + frame_num = 1 images = [] while frame_num: frame_num += 1 @@ -170,8 +179,8 @@ def extract_feature(self, opt, videofile): im = numpy.expand_dims(im,axis=0) im = numpy.transpose(im,(0,3,4,1,2)) - imtv = torch.autograd.Variable(torch.from_numpy(im.astype(float)).float()) - + imtv = torch.from_numpy(im.astype(float)).float() + # ========== ========== # Generate video feats # ========== 
========== @@ -181,10 +190,10 @@ def extract_feature(self, opt, videofile): tS = time.time() for i in range(0,lastframe,opt.batch_size): - + im_batch = [ imtv[:,:,vframe:vframe+5,:,:] for vframe in range(i,min(lastframe,i+opt.batch_size)) ] im_in = torch.cat(im_batch,0) - im_out = self.__S__.forward_lipfeat(im_in.cuda()); + im_out = self.__S__.forward_lipfeat(im_in.to(self.device)) im_feat.append(im_out.data.cpu()) im_feat = torch.cat(im_feat,0) @@ -192,17 +201,17 @@ def extract_feature(self, opt, videofile): # ========== ========== # Compute offset # ========== ========== - - print('Compute time %.3f sec.' % (time.time()-tS)) + + logger.info('Compute time %.3f sec.', time.time()-tS) return im_feat def loadParameters(self, path): - loaded_state = torch.load(path, map_location=lambda storage, loc: storage); + loaded_state = torch.load(path, map_location=lambda storage, loc: storage, weights_only=True) - self_state = self.__S__.state_dict(); + self_state = self.__S__.state_dict() for name, param in loaded_state.items(): - self_state[name].copy_(param); + self_state[name].copy_(param) diff --git a/SyncNetModel.py b/SyncNetModel.py index c21ce25..cf97caf 100755 --- a/SyncNetModel.py +++ b/SyncNetModel.py @@ -1,25 +1,16 @@ -#!/usr/bin/python +#!/usr/bin/env python3 #-*- coding: utf-8 -*- import torch import torch.nn as nn -def save(model, filename): - with open(filename, "wb") as f: - torch.save(model, f); - print("%s saved."%filename); - -def load(filename): - net = torch.load(filename) - return net; - class S(nn.Module): def __init__(self, num_layers_in_fc_layers = 1024): - super(S, self).__init__(); + super().__init__() - self.__nFeatures__ = 24; - self.__nChs__ = 32; - self.__midChs__ = 32; + self.__nFeatures__ = 24 + self.__nChs__ = 32 + self.__midChs__ = 32 self.netcnnaud = nn.Sequential( nn.Conv2d(1, 64, kernel_size=(3,3), stride=(1,1), padding=(1,1)), @@ -44,25 +35,25 @@ def __init__(self, num_layers_in_fc_layers = 1024): nn.BatchNorm2d(256), 
nn.ReLU(inplace=True), nn.MaxPool2d(kernel_size=(3,3), stride=(2,2)), - + nn.Conv2d(256, 512, kernel_size=(5,4), padding=(0,0)), nn.BatchNorm2d(512), nn.ReLU(), - ); + ) self.netfcaud = nn.Sequential( nn.Linear(512, 512), nn.BatchNorm1d(512), nn.ReLU(), nn.Linear(512, num_layers_in_fc_layers), - ); + ) self.netfclip = nn.Sequential( nn.Linear(512, 512), nn.BatchNorm1d(512), nn.ReLU(), nn.Linear(512, num_layers_in_fc_layers), - ); + ) self.netcnnlip = nn.Sequential( nn.Conv3d(3, 96, kernel_size=(5,7,7), stride=(1,2,2), padding=0), @@ -91,27 +82,27 @@ def __init__(self, num_layers_in_fc_layers = 1024): nn.Conv3d(256, 512, kernel_size=(1,6,6), padding=0), nn.BatchNorm3d(512), nn.ReLU(inplace=True), - ); + ) def forward_aud(self, x): - mid = self.netcnnaud(x); # N x ch x 24 x M - mid = mid.view((mid.size()[0], -1)); # N x (ch x 24) - out = self.netfcaud(mid); + mid = self.netcnnaud(x) # N x ch x 24 x M + mid = mid.view((mid.size(0), -1)) # N x (ch x 24) + out = self.netfcaud(mid) - return out; + return out def forward_lip(self, x): - mid = self.netcnnlip(x); - mid = mid.view((mid.size()[0], -1)); # N x (ch x 24) - out = self.netfclip(mid); + mid = self.netcnnlip(x) + mid = mid.view((mid.size(0), -1)) # N x (ch x 24) + out = self.netfclip(mid) - return out; + return out def forward_lipfeat(self, x): - mid = self.netcnnlip(x); - out = mid.view((mid.size()[0], -1)); # N x (ch x 24) + mid = self.netcnnlip(x) + out = mid.view((mid.size(0), -1)) # N x (ch x 24) - return out; \ No newline at end of file + return out diff --git a/demo_feature.py b/demo_feature.py index e3bd290..416acfc 100755 --- a/demo_feature.py +++ b/demo_feature.py @@ -1,31 +1,34 @@ -#!/usr/bin/python +#!/usr/bin/env python3 #-*- coding: utf-8 -*- -import time, pdb, argparse, subprocess +import time, pdb, argparse, subprocess, logging from SyncNetInstance import * +logging.basicConfig(level=logging.INFO, format='%(asctime)s %(name)s %(levelname)s: %(message)s') +logger = logging.getLogger(__name__) + # 
==================== LOAD PARAMS ==================== -parser = argparse.ArgumentParser(description = "SyncNet"); +parser = argparse.ArgumentParser(description = "SyncNet") -parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help=''); -parser.add_argument('--batch_size', type=int, default='20', help=''); -parser.add_argument('--vshift', type=int, default='15', help=''); -parser.add_argument('--videofile', type=str, default="data/example.avi", help=''); -parser.add_argument('--tmp_dir', type=str, default="data", help=''); -parser.add_argument('--save_as', type=str, default="data/features.pt", help=''); +parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help='') +parser.add_argument('--batch_size', type=int, default='20', help='') +parser.add_argument('--vshift', type=int, default='15', help='') +parser.add_argument('--videofile', type=str, default="data/example.avi", help='') +parser.add_argument('--tmp_dir', type=str, default="data", help='') +parser.add_argument('--save_as', type=str, default="data/features.pt", help='') -opt = parser.parse_args(); +opt = parser.parse_args() # ==================== RUN EVALUATION ==================== -s = SyncNetInstance(); +s = SyncNetInstance() -s.loadParameters(opt.initial_model); -print("Model %s loaded."%opt.initial_model); +s.loadParameters(opt.initial_model) +logger.info("Model %s loaded.", opt.initial_model) feats = s.extract_feature(opt, videofile=opt.videofile) diff --git a/demo_syncnet.py b/demo_syncnet.py index 01c25a6..8826b0a 100755 --- a/demo_syncnet.py +++ b/demo_syncnet.py @@ -1,30 +1,33 @@ -#!/usr/bin/python +#!/usr/bin/env python3 #-*- coding: utf-8 -*- -import time, pdb, argparse, subprocess +import time, pdb, argparse, subprocess, logging from SyncNetInstance import * +logging.basicConfig(level=logging.INFO, format='%(asctime)s %(name)s %(levelname)s: %(message)s') +logger = logging.getLogger(__name__) + # ==================== LOAD PARAMS 
==================== -parser = argparse.ArgumentParser(description = "SyncNet"); +parser = argparse.ArgumentParser(description = "SyncNet") -parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help=''); -parser.add_argument('--batch_size', type=int, default='20', help=''); -parser.add_argument('--vshift', type=int, default='15', help=''); -parser.add_argument('--videofile', type=str, default="data/example.avi", help=''); -parser.add_argument('--tmp_dir', type=str, default="data/work/pytmp", help=''); -parser.add_argument('--reference', type=str, default="demo", help=''); +parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help='') +parser.add_argument('--batch_size', type=int, default='20', help='') +parser.add_argument('--vshift', type=int, default='15', help='') +parser.add_argument('--videofile', type=str, default="data/example.avi", help='') +parser.add_argument('--tmp_dir', type=str, default="data/work/pytmp", help='') +parser.add_argument('--reference', type=str, default="demo", help='') -opt = parser.parse_args(); +opt = parser.parse_args() # ==================== RUN EVALUATION ==================== -s = SyncNetInstance(); +s = SyncNetInstance() -s.loadParameters(opt.initial_model); -print("Model %s loaded."%opt.initial_model); +s.loadParameters(opt.initial_model) +logger.info("Model %s loaded.", opt.initial_model) s.evaluate(opt, videofile=opt.videofile) diff --git a/detectors/s3fd/__init__.py b/detectors/s3fd/__init__.py index d7f35e0..3c61da7 100644 --- a/detectors/s3fd/__init__.py +++ b/detectors/s3fd/__init__.py @@ -1,11 +1,12 @@ -import time +import time, logging import numpy as np import cv2 import torch -from torchvision import transforms from .nets import S3FDNet from .box_utils import nms_ +logger = logging.getLogger(__name__) + PATH_WEIGHT = './detectors/s3fd/weights/sfd_face.pth' img_mean = np.array([104., 117., 123.])[:, np.newaxis, np.newaxis].astype('float32') @@ -17,12 +18,12 @@ def 
__init__(self, device='cuda'): tstamp = time.time() self.device = device - print('[S3FD] loading with', self.device) + logger.info('[S3FD] loading with %s', self.device) self.net = S3FDNet(device=self.device).to(self.device) - state_dict = torch.load(PATH_WEIGHT, map_location=self.device) + state_dict = torch.load(PATH_WEIGHT, map_location=self.device, weights_only=True) self.net.load_state_dict(state_dict) self.net.eval() - print('[S3FD] finished loading (%.4f sec)' % (time.time() - tstamp)) + logger.info('[S3FD] finished loading (%.4f sec)', time.time() - tstamp) def detect_faces(self, image, conf_th=0.8, scales=[1]): diff --git a/detectors/s3fd/box_utils.py b/detectors/s3fd/box_utils.py index 0779bcd..00686a2 100644 --- a/detectors/s3fd/box_utils.py +++ b/detectors/s3fd/box_utils.py @@ -35,7 +35,7 @@ def nms_(dets, thresh): inds = np.where(ovr <= thresh)[0] order = order[inds + 1] - return np.array(keep).astype(np.int) + return np.array(keep).astype(np.intp) def decode(loc, priors, variances): @@ -82,45 +82,32 @@ def nms(boxes, scores, overlap=0.5, top_k=200): v, idx = scores.sort(0) # sort in ascending order # I = I[v >= 0.01] idx = idx[-top_k:] # indices of the top-k largest vals - xx1 = boxes.new() - yy1 = boxes.new() - xx2 = boxes.new() - yy2 = boxes.new() - w = boxes.new() - h = boxes.new() - - # keep = torch.Tensor() + count = 0 while idx.numel() > 0: i = idx[-1] # index of current largest val - # keep.append(i) keep[count] = i count += 1 if idx.size(0) == 1: break idx = idx[:-1] # remove kept element from view # load bboxes of next highest vals - torch.index_select(x1, 0, idx, out=xx1) - torch.index_select(y1, 0, idx, out=yy1) - torch.index_select(x2, 0, idx, out=xx2) - torch.index_select(y2, 0, idx, out=yy2) + xx1 = torch.index_select(x1, 0, idx) + yy1 = torch.index_select(y1, 0, idx) + xx2 = torch.index_select(x2, 0, idx) + yy2 = torch.index_select(y2, 0, idx) # store element-wise max with next highest score xx1 = torch.clamp(xx1, min=x1[i]) yy1 = 
torch.clamp(yy1, min=y1[i]) xx2 = torch.clamp(xx2, max=x2[i]) yy2 = torch.clamp(yy2, max=y2[i]) - w.resize_as_(xx2) - h.resize_as_(yy2) - w = xx2 - xx1 - h = yy2 - yy1 - # check sizes of xx1 and xx2.. after each iteration - w = torch.clamp(w, min=0.0) - h = torch.clamp(h, min=0.0) + w = torch.clamp(xx2 - xx1, min=0.0) + h = torch.clamp(yy2 - yy1, min=0.0) inter = w * h # IoU = i / (area(a) + area(b) - i) - rem_areas = torch.index_select(area, 0, idx) # load remaining areas) + rem_areas = torch.index_select(area, 0, idx) union = (rem_areas - inter) + area[i] - IoU = inter / union # store result in iou + IoU = inter / union # keep only elements with an IoU <= overlap idx = idx[IoU.le(overlap)] return keep, count @@ -181,7 +168,7 @@ def __init__(self, input_size, feature_maps, steps=[4, 8, 16, 32, 64, 128], clip=False): - super(PriorBox, self).__init__() + super().__init__() self.imh = input_size[0] self.imw = input_size[1] diff --git a/detectors/s3fd/nets.py b/detectors/s3fd/nets.py index 85b5c82..937a73f 100644 --- a/detectors/s3fd/nets.py +++ b/detectors/s3fd/nets.py @@ -8,7 +8,7 @@ class L2Norm(nn.Module): def __init__(self, n_channels, scale): - super(L2Norm, self).__init__() + super().__init__() self.n_channels = n_channels self.gamma = scale or None self.eps = 1e-10 @@ -28,7 +28,7 @@ def forward(self, x): class S3FDNet(nn.Module): def __init__(self, device='cuda'): - super(S3FDNet, self).__init__() + super().__init__() self.device = device self.vgg = nn.ModuleList([ diff --git a/download_model.sh b/download_model.sh index 3e3a9dc..34895d9 100755 --- a/download_model.sh +++ b/download_model.sh @@ -1,9 +1,9 @@ # SyncNet model -mkdir data +mkdir -p data wget http://www.robots.ox.ac.uk/~vgg/software/lipsync/data/syncnet_v2.model -O data/syncnet_v2.model wget http://www.robots.ox.ac.uk/~vgg/software/lipsync/data/example.avi -O data/example.avi # For the pre-processing pipeline -mkdir detectors/s3fd/weights +mkdir -p detectors/s3fd/weights wget 
https://www.robots.ox.ac.uk/~vgg/software/lipsync/data/sfd_face.pth -O detectors/s3fd/weights/sfd_face.pth \ No newline at end of file diff --git a/environment-cpu.yml b/environment-cpu.yml new file mode 100644 index 0000000..9dc3b99 --- /dev/null +++ b/environment-cpu.yml @@ -0,0 +1,27 @@ +name: syncnet +channels: + - conda-forge + - pytorch + - defaults +dependencies: + # Core Python and Math Libraries + - python=3.10 + - numpy + - scipy + + # PyTorch Ecosystem + - pytorch::pytorch==2.5.1 + - pytorch::torchvision==0.20.1 + - pytorch::torchaudio==2.5.1 + + # External Tools + - ffmpeg + + # Pip Installer + - pip + + # Pip-specific packages (Runs after Conda finishes) + - pip: + - scenedetect==0.6.7.1 + - opencv-contrib-python==4.13.0.92 + - python_speech_features==0.6 diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..edd13be --- /dev/null +++ b/environment.yml @@ -0,0 +1,29 @@ +name: syncnet +channels: + - conda-forge + - pytorch + - nvidia + - defaults +dependencies: + # Core Python and Math Libraries + - python=3.10 + - numpy + - scipy + + # PyTorch Ecosystem + - pytorch::pytorch==2.5.1 + - pytorch::torchvision==0.20.1 + - pytorch::torchaudio==2.5.1 + - pytorch::pytorch-cuda=12.4 + + # External Tools + - ffmpeg + + # Pip Installer + - pip + + # Pip-specific packages (Runs after Conda finishes) + - pip: + - scenedetect==0.6.7.1 + - opencv-contrib-python==4.13.0.92 + - python_speech_features==0.6 diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 8919740..0000000 --- a/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -torch>=1.4.0 -torchvision>=0.5.0 -numpy>=1.18.1 -scipy>=1.2.1 -scenedetect==0.5.1 -opencv-contrib-python -python_speech_features diff --git a/run_pipeline.py b/run_pipeline.py index f5fc22e..50fee52 100755 --- a/run_pipeline.py +++ b/run_pipeline.py @@ -1,15 +1,14 @@ -#!/usr/bin/python +#!/usr/bin/env python3 -import sys, time, os, pdb, argparse, pickle, subprocess, glob, cv2 +import sys, 
time, os, pdb, argparse, pickle, subprocess, glob, cv2, logging import numpy as np +import torch from shutil import rmtree -import scenedetect -from scenedetect.video_manager import VideoManager -from scenedetect.scene_manager import SceneManager -from scenedetect.frame_timecode import FrameTimecode -from scenedetect.stats_manager import StatsManager -from scenedetect.detectors import ContentDetector +logging.basicConfig(level=logging.INFO, format='%(asctime)s %(name)s %(levelname)s: %(message)s') +logger = logging.getLogger(__name__) + +from scenedetect import open_video, SceneManager, ContentDetector from scipy.interpolate import interp1d from scipy.io import wavfile @@ -21,17 +20,17 @@ # # PARSE ARGS # ========== ========== ========== ========== -parser = argparse.ArgumentParser(description = "FaceTracker"); -parser.add_argument('--data_dir', type=str, default='data/work', help='Output direcotry'); -parser.add_argument('--videofile', type=str, default='', help='Input video file'); -parser.add_argument('--reference', type=str, default='', help='Video reference'); -parser.add_argument('--facedet_scale', type=float, default=0.25, help='Scale factor for face detection'); -parser.add_argument('--crop_scale', type=float, default=0.40, help='Scale bounding box'); -parser.add_argument('--min_track', type=int, default=100, help='Minimum facetrack duration'); -parser.add_argument('--frame_rate', type=int, default=25, help='Frame rate'); -parser.add_argument('--num_failed_det', type=int, default=25, help='Number of missed detections allowed before tracking is stopped'); -parser.add_argument('--min_face_size', type=int, default=100, help='Minimum face size in pixels'); -opt = parser.parse_args(); +parser = argparse.ArgumentParser(description = "FaceTracker") +parser.add_argument('--data_dir', type=str, default='data/work', help='Output directory') +parser.add_argument('--videofile', type=str, default='', help='Input video file') +parser.add_argument('--reference', type=str, 
default='', help='Video reference') +parser.add_argument('--facedet_scale', type=float, default=0.25, help='Scale factor for face detection') +parser.add_argument('--crop_scale', type=float, default=0.40, help='Scale bounding box') +parser.add_argument('--min_track', type=int, default=100, help='Minimum facetrack duration') +parser.add_argument('--frame_rate', type=int, default=25, help='Frame rate') +parser.add_argument('--num_failed_det', type=int, default=25, help='Number of missed detections allowed before tracking is stopped') +parser.add_argument('--min_face_size', type=int, default=100, help='Minimum face size in pixels') +opt = parser.parse_args() setattr(opt,'avi_dir',os.path.join(opt.data_dir,'pyavi')) setattr(opt,'tmp_dir',os.path.join(opt.data_dir,'pytmp')) @@ -44,19 +43,19 @@ # ========== ========== ========== ========== def bb_intersection_over_union(boxA, boxB): - + xA = max(boxA[0], boxB[0]) yA = max(boxA[1], boxB[1]) xB = min(boxA[2], boxB[2]) yB = min(boxA[3], boxB[3]) - + interArea = max(0, xB - xA) * max(0, yB - yA) - + boxAArea = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1]) boxBArea = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1]) - + iou = interArea / float(boxAArea + boxBArea - interArea) - + return iou # ========== ========== ========== ========== @@ -87,7 +86,7 @@ def track_shot(opt,scenefaces): if track == []: break elif len(track) > opt.min_track: - + framenum = np.array([ f['frame'] for f in track ]) bboxes = np.array([np.array(f['bbox']) for f in track]) @@ -107,7 +106,7 @@ def track_shot(opt,scenefaces): # ========== ========== ========== ========== # # VIDEO CROP AND SAVE # ========== ========== ========== ========== - + def crop_video(opt,track,cropfile): flist = glob.glob(os.path.join(opt.frames_dir,opt.reference,'*.jpg')) @@ -120,12 +119,12 @@ def crop_video(opt,track,cropfile): for det in track['bbox']: - dets['s'].append(max((det[3]-det[1]),(det[2]-det[0]))/2) - dets['y'].append((det[1]+det[3])/2) # crop center x + 
dets['s'].append(max((det[3]-det[1]),(det[2]-det[0]))/2) + dets['y'].append((det[1]+det[3])/2) # crop center x dets['x'].append((det[0]+det[2])/2) # crop center y # Smooth detections - dets['s'] = signal.medfilt(dets['s'],kernel_size=13) + dets['s'] = signal.medfilt(dets['s'],kernel_size=13) dets['x'] = signal.medfilt(dets['x'],kernel_size=13) dets['y'] = signal.medfilt(dets['y'],kernel_size=13) @@ -134,16 +133,16 @@ def crop_video(opt,track,cropfile): cs = opt.crop_scale bs = dets['s'][fidx] # Detection box size - bsi = int(bs*(1+2*cs)) # Pad videos by this amount + bsi = int(bs*(1+2*cs)) # Pad videos by this amount image = cv2.imread(flist[frame]) - + frame = np.pad(image,((bsi,bsi),(bsi,bsi),(0,0)), 'constant', constant_values=(110,110)) my = dets['y'][fidx]+bsi # BBox center Y mx = dets['x'][fidx]+bsi # BBox center X face = frame[int(my-bs):int(my+bs*(1+2*cs)),int(mx-bs*(1+cs)):int(mx+bs*(1+cs))] - + vOut.write(cv2.resize(face,(224,224))) audiotmp = os.path.join(opt.tmp_dir,opt.reference,'audio.wav') @@ -154,27 +153,25 @@ def crop_video(opt,track,cropfile): # ========== CROP AUDIO FILE ========== - command = ("ffmpeg -y -i %s -ss %.3f -to %.3f %s" % (os.path.join(opt.avi_dir,opt.reference,'audio.wav'),audiostart,audioend,audiotmp)) - output = subprocess.call(command, shell=True, stdout=None) - - if output != 0: - pdb.set_trace() + command = ["ffmpeg", "-y", "-i", + os.path.join(opt.avi_dir, opt.reference, 'audio.wav'), + "-ss", "%.3f" % audiostart, "-to", "%.3f" % audioend, + audiotmp] + subprocess.run(command, check=True) sample_rate, audio = wavfile.read(audiotmp) # ========== COMBINE AUDIO AND VIDEO FILES ========== - command = ("ffmpeg -y -i %st.avi -i %s -c:v copy -c:a copy %s.avi" % (cropfile,audiotmp,cropfile)) - output = subprocess.call(command, shell=True, stdout=None) - - if output != 0: - pdb.set_trace() + command = ["ffmpeg", "-y", "-i", cropfile+'t.avi', "-i", audiotmp, + "-c:v", "copy", "-c:a", "copy", cropfile+'.avi'] + subprocess.run(command, 
check=True) - print('Written %s'%cropfile) + logger.info('Written %s', cropfile) os.remove(cropfile+'t.avi') - print('Mean pos: x %.2f y %.2f s %.2f'%(np.mean(dets['x']),np.mean(dets['y']),np.mean(dets['s']))) + logger.info('Mean pos: x %.2f y %.2f s %.2f', np.mean(dets['x']),np.mean(dets['y']),np.mean(dets['s'])) return {'track':track, 'proc_track':dets} @@ -184,29 +181,30 @@ def crop_video(opt,track,cropfile): def inference_video(opt): - DET = S3FD(device='cuda') + device = 'cuda' if torch.cuda.is_available() else 'cpu' + DET = S3FD(device=device) flist = glob.glob(os.path.join(opt.frames_dir,opt.reference,'*.jpg')) flist.sort() dets = [] - + for fidx, fname in enumerate(flist): start_time = time.time() - + image = cv2.imread(fname) image_np = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) bboxes = DET.detect_faces(image_np, conf_th=0.9, scales=[opt.facedet_scale]) - dets.append([]); + dets.append([]) for bbox in bboxes: dets[-1].append({'frame':fidx, 'bbox':(bbox[:-1]).tolist(), 'conf':bbox[-1]}) elapsed_time = time.time() - start_time - print('%s-%05d; %d dets; %.2f Hz' % (os.path.join(opt.avi_dir,opt.reference,'video.avi'),fidx,len(dets[-1]),(1/elapsed_time))) + logger.info('%s-%05d; %d dets; %.2f Hz', os.path.join(opt.avi_dir,opt.reference,'video.avi'),fidx,len(dets[-1]),(1/elapsed_time)) savepath = os.path.join(opt.work_dir,opt.reference,'faces.pckl') @@ -221,33 +219,27 @@ def inference_video(opt): def scene_detect(opt): - video_manager = VideoManager([os.path.join(opt.avi_dir,opt.reference,'video.avi')]) - stats_manager = StatsManager() - scene_manager = SceneManager(stats_manager) - # Add ContentDetector algorithm (constructor takes detector options like threshold). 
- scene_manager.add_detector(ContentDetector()) - base_timecode = video_manager.get_base_timecode() - - video_manager.set_downscale_factor() - - video_manager.start() + video_path = os.path.join(opt.avi_dir,opt.reference,'video.avi') + video = open_video(video_path) - scene_manager.detect_scenes(frame_source=video_manager) + scene_manager = SceneManager() + scene_manager.add_detector(ContentDetector()) + scene_manager.detect_scenes(video) - scene_list = scene_manager.get_scene_list(base_timecode) + scene_list = scene_manager.get_scene_list() savepath = os.path.join(opt.work_dir,opt.reference,'scene.pckl') - if scene_list == []: - scene_list = [(video_manager.get_base_timecode(),video_manager.get_current_timecode())] + if not scene_list: + scene_list = [(video.base_timecode, video.base_timecode + video.duration)] with open(savepath, 'wb') as fil: pickle.dump(scene_list, fil) - print('%s - scenes detected %d'%(os.path.join(opt.avi_dir,opt.reference,'video.avi'),len(scene_list))) + logger.info('%s - scenes detected %d', video_path, len(scene_list)) return scene_list - + # ========== ========== ========== ========== # # EXECUTE DEMO @@ -255,39 +247,31 @@ def scene_detect(opt): # ========== DELETE EXISTING DIRECTORIES ========== -if os.path.exists(os.path.join(opt.work_dir,opt.reference)): - rmtree(os.path.join(opt.work_dir,opt.reference)) - -if os.path.exists(os.path.join(opt.crop_dir,opt.reference)): - rmtree(os.path.join(opt.crop_dir,opt.reference)) - -if os.path.exists(os.path.join(opt.avi_dir,opt.reference)): - rmtree(os.path.join(opt.avi_dir,opt.reference)) - -if os.path.exists(os.path.join(opt.frames_dir,opt.reference)): - rmtree(os.path.join(opt.frames_dir,opt.reference)) - -if os.path.exists(os.path.join(opt.tmp_dir,opt.reference)): - rmtree(os.path.join(opt.tmp_dir,opt.reference)) +for d in [opt.work_dir, opt.crop_dir, opt.avi_dir, opt.frames_dir, opt.tmp_dir]: + path = os.path.join(d, opt.reference) + if os.path.exists(path): + rmtree(path) # ========== MAKE 
NEW DIRECTORIES ========== -os.makedirs(os.path.join(opt.work_dir,opt.reference)) -os.makedirs(os.path.join(opt.crop_dir,opt.reference)) -os.makedirs(os.path.join(opt.avi_dir,opt.reference)) -os.makedirs(os.path.join(opt.frames_dir,opt.reference)) -os.makedirs(os.path.join(opt.tmp_dir,opt.reference)) +for d in [opt.work_dir, opt.crop_dir, opt.avi_dir, opt.frames_dir, opt.tmp_dir]: + os.makedirs(os.path.join(d, opt.reference), exist_ok=True) # ========== CONVERT VIDEO AND EXTRACT FRAMES ========== -command = ("ffmpeg -y -i %s -qscale:v 2 -async 1 -r 25 %s" % (opt.videofile,os.path.join(opt.avi_dir,opt.reference,'video.avi'))) -output = subprocess.call(command, shell=True, stdout=None) +command = ["ffmpeg", "-y", "-i", opt.videofile, "-qscale:v", "2", "-async", "1", "-r", "25", + os.path.join(opt.avi_dir, opt.reference, 'video.avi')] +subprocess.run(command, check=True) -command = ("ffmpeg -y -i %s -qscale:v 2 -threads 1 -f image2 %s" % (os.path.join(opt.avi_dir,opt.reference,'video.avi'),os.path.join(opt.frames_dir,opt.reference,'%06d.jpg'))) -output = subprocess.call(command, shell=True, stdout=None) +command = ["ffmpeg", "-y", "-i", os.path.join(opt.avi_dir, opt.reference, 'video.avi'), + "-qscale:v", "2", "-threads", "1", "-f", "image2", + os.path.join(opt.frames_dir, opt.reference, '%06d.jpg')] +subprocess.run(command, check=True) -command = ("ffmpeg -y -i %s -ac 1 -vn -acodec pcm_s16le -ar 16000 %s" % (os.path.join(opt.avi_dir,opt.reference,'video.avi'),os.path.join(opt.avi_dir,opt.reference,'audio.wav'))) -output = subprocess.call(command, shell=True, stdout=None) +command = ["ffmpeg", "-y", "-i", os.path.join(opt.avi_dir, opt.reference, 'video.avi'), + "-ac", "1", "-vn", "-acodec", "pcm_s16le", "-ar", "16000", + os.path.join(opt.avi_dir, opt.reference, 'audio.wav')] +subprocess.run(command, check=True) # ========== FACE DETECTION ========== @@ -304,8 +288,8 @@ def scene_detect(opt): for shot in scene: - if shot[1].frame_num - shot[0].frame_num >= 
opt.min_track : - alltracks.extend(track_shot(opt,faces[shot[0].frame_num:shot[1].frame_num])) + if shot[1].get_frames() - shot[0].get_frames() >= opt.min_track : + alltracks.extend(track_shot(opt,faces[shot[0].get_frames():shot[1].get_frames()])) # ========== FACE TRACK CROP ========== diff --git a/run_syncnet.py b/run_syncnet.py index 45099fd..dc9c2c0 100755 --- a/run_syncnet.py +++ b/run_syncnet.py @@ -1,20 +1,23 @@ -#!/usr/bin/python +#!/usr/bin/env python3 #-*- coding: utf-8 -*- -import time, pdb, argparse, subprocess, pickle, os, gzip, glob +import time, pdb, argparse, subprocess, pickle, os, gzip, glob, logging from SyncNetInstance import * +logging.basicConfig(level=logging.INFO, format='%(asctime)s %(name)s %(levelname)s: %(message)s') +logger = logging.getLogger(__name__) + # ==================== PARSE ARGUMENT ==================== -parser = argparse.ArgumentParser(description = "SyncNet"); -parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help=''); -parser.add_argument('--batch_size', type=int, default='20', help=''); -parser.add_argument('--vshift', type=int, default='15', help=''); -parser.add_argument('--data_dir', type=str, default='data/work', help=''); -parser.add_argument('--videofile', type=str, default='', help=''); -parser.add_argument('--reference', type=str, default='', help=''); -opt = parser.parse_args(); +parser = argparse.ArgumentParser(description = "SyncNet") +parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help='') +parser.add_argument('--batch_size', type=int, default='20', help='') +parser.add_argument('--vshift', type=int, default='15', help='') +parser.add_argument('--data_dir', type=str, default='data/work', help='') +parser.add_argument('--videofile', type=str, default='', help='') +parser.add_argument('--reference', type=str, default='', help='') +opt = parser.parse_args() setattr(opt,'avi_dir',os.path.join(opt.data_dir,'pyavi')) 
setattr(opt,'tmp_dir',os.path.join(opt.data_dir,'pytmp')) @@ -24,10 +27,10 @@ # ==================== LOAD MODEL AND FILE LIST ==================== -s = SyncNetInstance(); +s = SyncNetInstance() -s.loadParameters(opt.initial_model); -print("Model %s loaded."%opt.initial_model); +s.loadParameters(opt.initial_model) +logger.info("Model %s loaded.", opt.initial_model) flist = glob.glob(os.path.join(opt.crop_dir,opt.reference,'0*.avi')) flist.sort() @@ -38,7 +41,7 @@ for idx, fname in enumerate(flist): offset, conf, dist = s.evaluate(opt,videofile=fname) dists.append(dist) - + # ==================== PRINT RESULTS TO FILE ==================== with open(os.path.join(opt.work_dir,opt.reference,'activesd.pckl'), 'wb') as fil: diff --git a/run_visualise.py b/run_visualise.py index 85d8925..6148963 100644 --- a/run_visualise.py +++ b/run_visualise.py @@ -1,21 +1,24 @@ -#!/usr/bin/python +#!/usr/bin/env python3 #-*- coding: utf-8 -*- import torch import numpy -import time, pdb, argparse, subprocess, pickle, os, glob +import time, pdb, argparse, subprocess, pickle, os, glob, logging import cv2 +logging.basicConfig(level=logging.INFO, format='%(asctime)s %(name)s %(levelname)s: %(message)s') +logger = logging.getLogger(__name__) + from scipy import signal # ==================== PARSE ARGUMENT ==================== -parser = argparse.ArgumentParser(description = "SyncNet"); -parser.add_argument('--data_dir', type=str, default='data/work', help=''); -parser.add_argument('--videofile', type=str, default='', help=''); -parser.add_argument('--reference', type=str, default='', help=''); -parser.add_argument('--frame_rate', type=int, default=25, help='Frame rate'); -opt = parser.parse_args(); +parser = argparse.ArgumentParser(description = "SyncNet") +parser.add_argument('--data_dir', type=str, default='data/work', help='') +parser.add_argument('--videofile', type=str, default='', help='') +parser.add_argument('--reference', type=str, default='', help='') 
+parser.add_argument('--frame_rate', type=int, default=25, help='Frame rate') +opt = parser.parse_args() setattr(opt,'avi_dir',os.path.join(opt.data_dir,'pyavi')) setattr(opt,'tmp_dir',os.path.join(opt.data_dir,'pytmp')) @@ -42,8 +45,8 @@ mean_dists = numpy.mean(numpy.stack(dists[tidx],1),1) minidx = numpy.argmin(mean_dists,0) - minval = mean_dists[minidx] - + minval = mean_dists[minidx] + fdist = numpy.stack([dist[minidx] for dist in dists[tidx]]) fdist = numpy.pad(fdist, (3,3), 'constant', constant_values=10) @@ -69,20 +72,22 @@ for face in faces[fidx]: - clr = max(min(face['conf']*25,255),0) + clr = int(max(min(face['conf']*25,255),0)) cv2.rectangle(image,(int(face['x']-face['s']),int(face['y']-face['s'])),(int(face['x']+face['s']),int(face['y']+face['s'])),(0,clr,255-clr),3) cv2.putText(image,'Track %d, Conf %.3f'%(face['track'],face['conf']), (int(face['x']-face['s']),int(face['y']-face['s'])),cv2.FONT_HERSHEY_SIMPLEX,0.5,(255,255,255),2) vOut.write(image) - print('Frame %d'%fidx) + logger.info('Frame %d', fidx) vOut.release() # ========== COMBINE AUDIO AND VIDEO FILES ========== -command = ("ffmpeg -y -i %s -i %s -c:v copy -c:a copy %s" % (os.path.join(opt.avi_dir,opt.reference,'video_only.avi'),os.path.join(opt.avi_dir,opt.reference,'audio.wav'),os.path.join(opt.avi_dir,opt.reference,'video_out.avi'))) #-async 1 -output = subprocess.call(command, shell=True, stdout=None) - - +command = ["ffmpeg", "-y", "-i", + os.path.join(opt.avi_dir, opt.reference, 'video_only.avi'), + "-i", os.path.join(opt.avi_dir, opt.reference, 'audio.wav'), + "-c:v", "copy", "-c:a", "copy", + os.path.join(opt.avi_dir, opt.reference, 'video_out.avi')] +subprocess.run(command, check=True)