vtlim · magee256 · Jul 11, 2017 · vtlim · Jul 12, 2017 · magee256
diff --git a/matchMinima.py b/matchMinima.py
@@ -19,6 +19,8 @@
 
 
 
+#M Do what OE does and define your own molecule objects, they could even 
+#M extend the OE chem definitions to save you work. 
 def compare2Mols(rmol, qmol):
     """
     For two identical molecules, with varying conformers,
@@ -117,6 +119,7 @@ def plotMolMinima(molName, minimaE, xticklabels, selected=None,stag=False):
     plt.yticks(fontsize=12)
 
     ### Plot the data.
+    #M Didn't know about that
     colors = mpl.cm.rainbow(np.linspace(0, 1, numFiles))
     markers = ["x","^","8","d","o","s","*","p","v","<","D","+",">","."]*10
     #for i in reversed(range(numFiles)):
@@ -147,6 +150,11 @@ def plotAvgTimes(molName, avgTimes, sdTimes, xticklabels):
     figname = "timebars_%s.png" % molName
     x = range(len(avgTimes))
 
+    #M Hardcoding values is poor style, consider defining:
+    #M TITLE_SIZE = 20, Y_SIZE = 18, ...
+    #M makes it easier to change when everything is in one place
+    #M If you only use the values once I don't think hard code is 
+    #M terrible though.
     ### Label figure. Label xticks before plot for better spacing.
     plt.title(plttitle,fontsize=20)
     plt.ylabel(ylabel,fontsize=18)
@@ -161,6 +169,8 @@ def plotAvgTimes(molName, avgTimes, sdTimes, xticklabels):
     plt.clf()
 
 def plotHeatRMSE(molName, rmsArray, ticklabels,ptitle='RMS error (kcal/mol)',fprefix='rmse'):
+    #M Makes a heat map of root mean square error in ..., add a brief summary
+    #M all plotting functions
     plttitle="%s\n%s" % (ptitle,molName)
     figname = "%s_%s.png" % (fprefix, molName)
     x = range(len(rmsArray))
@@ -172,6 +182,7 @@ def plotHeatRMSE(molName, rmsArray, ticklabels,ptitle='RMS error (kcal/mol)',fpr
     plt.imshow(np.asarray(rmsArray).T, cmap='jet', origin='lower')
     plt.colorbar()
 
+    #M Again, limit hardcoding
     ### Label figure. Label xticks before plot for better spacing.
     plt.title(plttitle,fontsize=20)
     plt.xticks(x,ticklabels,fontsize=12,rotation=-20, ha='left')
@@ -265,14 +276,15 @@ def loadFile(fname):
 
     sdfRef = sdfList[0]
     numFiles = len(sdfList)
+    #M might be cleaner to have a list of molecule objects with each holding an
+    #M elist, tlist, indices, num conformers, name, etc. - Makes grouping clearer
     allIndices = [] # for M mols, N reference minima of each mol, P matching indices for each ref minimia
     elists = [] # 2D list: K mols per file x J numFiles
     tlists = [] # 2D list: K mols per file x J numFiles
     refNumConfs = [] # number of conformers for each mol in reference file
     molNames = [] # name of each molecule. for plotting.
 
-    for i, sdfQuery in enumerate(sdfList):
-        qthry = thryList[i]
+    for sdfQuery, qthry in zip(sdfList,thryList):
         qmethod = qthry.split('/')[0].strip()
         qbasis = qthry.split('/')[1].strip()
 
@@ -326,56 +338,7 @@ def loadFile(fname):
 
     return molNames, refNumConfs, allIndices, elists, tlists
 
-def getAllTimes(sdfList, thryList):
-    """
-    Get times saved in SD tages from files listed in python input file lines. 
-       No longer used after edits to matchMinima (07/1/2017).
-
-    Parameters
-    ----------
-    sdfList: str list - list of the SDF file names to be analyzed.
-          This list should include reference SDF file (sdfRef) as first element.
-    thryList: str list - list of levels of theory corresponding to the files
-          in sdfList. E.g., ['MP2/def2-TZVP','B3LYP-D3MBJ/6-311++G**']
-
-    Returns
-    -------
-    timelists: 3D list where timelists[i][j][k] is the wall time for optimizing
-       for the ith level of theory, jth molecule, kth conformer
-
-    """
-
-    sdfRef = sdfList[0]
-    timelists = []
-    for i, sdfQuery in enumerate(sdfList):
-        qthry = thryList[i]
-        qmethod = qthry.split('/')[0].strip()
-        qbasis = qthry.split('/')[1].strip()
-
-        print("\n\nOpening reference file %s" % sdfRef)
-        ifsRef = oechem.oemolistream()
-        ifsRef.SetConfTest( oechem.OEAbsoluteConfTest() )
-        if not ifsRef.open(sdfRef):
-            oechem.OEThrow.Fatal("Unable to open %s for reading" % sdfRef)
-        molsRef = ifsRef.GetOEMols()
-
-        print("Opening query file %s, and using [ %s ] wall times" % (sdfQuery, qthry))
-        ifsQuery = oechem.oemolistream()
-        ifsQuery.SetConfTest( oechem.OEAbsoluteConfTest() )
-        if not ifsQuery.open(sdfQuery):
-            oechem.OEThrow.Fatal("Unable to open %s for reading" % sdfQuery)
-        molsQuery = ifsQuery.GetOEMols()
-
-        for rmol in molsRef:
-            try:
-                qmol = molsQuery.next()
-                timelists.append(map(float, pt.GetSDList(qmol, "opt runtime",'Psi4', qmethod, qbasis))) # for opt, not spe
-            except StopIteration:
-                print("No %s molecule found in %s" % (rmol.GetTitle(), sdfQuery))
-                timelists.append([nan]*rmol.NumConfs())
-                continue
-
-    return timelists
+#M If it's no longer used delete it. Git will remember
 
 def calcRMSError(trimE, zeroes):
     """
@@ -432,13 +395,15 @@ def getRatioTimes(allMolTimes, zeroes):
 
     """
 
+#M Unrelated: You're compring times spent in each minima? What for?
     relByFile = []
     sdByFile = []
     for i, molist in enumerate(allMolTimes):
         molTimes = []
         molStds = []
         for j, filelist in enumerate(molist):
             rels = np.asarray(filelist)/np.asarray(molist[0])
+            #M I trust this is safe?
             rels = rels[~np.isnan(rels)] # delete nan values to get avg********
             avg = np.mean(rels)
             sd = np.std(rels)
@@ -482,46 +447,15 @@ def calcRelEne(minimaE):
         print i, nanCnt
         zeroes.append(nanCnt.index(min(nanCnt)))
 
-#        zero = 0  # initial guess for this molecule's reference among all files
-#        zpass = False  # the zero is valid if all files have it and not nan
-#
-#        while not zpass:
-#            try: # get enes at index zero
-#                print molist # infinite loop?!??!
-#                zeroths = [item[zero] if sum(np.isnan(item)) < len(molist[0]) else '' for item in molist]
-#            except IndexError: # error if zero index is too far
-#                pass
-#            if len(molist[0]) == 1: # no rel. ene for just one conf
-#                mols2del.append(i)
-#                print "ONLY ONE CONF FOUND FOR MOL ",i
-#                break
-##            elif zero >= len(molist[0]): # all values are nan
-##                mols2del.append(i)
-##                print "ALL NANS FOR 1+ FILE OF MOL ",i
-##                zero = 0
-##                break
-#            elif nan in zeroths: # missing ref for 1+ file(s)
-#                zero = zero + 1
-#                continue
-#            else: # found successful conf in all files
-#                zeroes.append(zero)
-#                zpass = True
-#
-#    # delete cases with just one conformer
-#    print("ATTN: these (zero-indexed) mols were removed from analysis due to "
-#         +"single conformer or no conformer matches in at least one file: ",
-#         mols2del)
-#    trimE = np.delete(np.asarray(minimaE),mols2del,axis=0)
-#    if len(trimE) != len(zeroes):
-#        print len(trimE), zeroes
-#        sys.exit("Error in determining reference confs for molecules.")
+#M Delete unused code unless you expect to add it back in soon. Reduces clutter
 
     # calc relative energies, and convert Hartrees to kcal/mol.
     mintemp = []  # not sure why this is needed but writeRelEne kicks fuss without it
     for z, molE in zip(zeroes, minimaE):
     #for z, molE in zip(zeroes, trimE):
         temp = [] # temp list for this mol's relative energies
         for fileE in molE:
+            #M Define hart_to_kcal_mol = 627.5095 for clarity
             temp.append([627.5095*(fileE[i]-fileE[z]) for i in range(len(fileE))])
         mintemp.append(temp)
     trimE = mintemp
@@ -568,6 +502,7 @@ def reorganizeSublists(theArray,allMolIndices):
     """
     Instead of grouping by files then by molecule,
         reorder to group by molecules then by file.
+        #M I think you meant fileN in first line? 
     Something like this:
        [[[file1 mol1] [file1 mol2]] ... [[file2 mol1] [file2 mol2]]]
     Goes to this:
@@ -580,14 +515,20 @@ def reorganizeSublists(theArray,allMolIndices):
     minimaE = []
     for i, molIndices in enumerate(allMolIndices):
         molE = [] # all conf energies from ith mol in all files
+        #M Try this:
+        #M      flipped = np.array(theArray[i]).T
+        #M      molE.append([ nan if (x == None or x == -2) else theArray[i][j][k] for ... ])
         for j, fileIndices in enumerate(molIndices):
             fileE = []  # all conf energies from ith mol in jth file
             for k, confNum in enumerate(fileIndices):
+                #M Separate reordering from filling NaNs, easier to follow
                 # None: means no conf in qmol within 0.5 Angs of rmol's conf
                 # -2: means that the conformer doesn't exist (if filtered
                 #   out from job not finishing, etc.)
                 if confNum == None or confNum==-2:
+                    #M Useful to keep the two failure modes separate?
                     fileE.append(nan)
+                #M do you need this?
                 elif confNum == -1:   # -1 signifies reference theory
                     fileE.append(float(theArray[i][j][k]))
                 else:
@@ -596,6 +537,10 @@ def reorganizeSublists(theArray,allMolIndices):
         minimaE.append(molE)
     return minimaE
 
+#M I take it there's no better way to do this?
+#M You should take a look at: https://docs.python.org/2/library/unittest.html
+#M I haven't used it myself but I really should. Also look into using assert
+#M statements
 def debugging():
     molNames = ['AlkEthOH_c1178','GBI']
     refNumConfs = [19, 1]
@@ -656,6 +601,8 @@ def debugging():
                except the files should point to the relative energies .dat file\
                generated as result of minimaMatching.")
 
+    #M I suppose vars(args) is fine, but might be better
+    #M to use args.input, args.readpickle
     args = parser.parse_args()
     opt = vars(args)
     if not os.path.exists(opt['input']):
@@ -676,9 +623,18 @@ def debugging():
             thryList.append(dataline[0])
             sdfList.append(dataline[1])
 
+    #M Don't do so much in __main__, try defining more, smaller functions. 
+    #M Leaving __main__ for high level logic makes code more readable
 
     # =========================================================================
+    #M I expect there's the opportunity to define a class here. 
+    #M Maybe have each molecule with its energies, atoms and configurations be a 
+    #M class and define member functions that do stuff like matching minima. 
+    #M Might make things cleaner. 
+
+    #M def find_mol_features(opt.readpickle,sdfList,thryList):
     if not opt['readpickle']:
+        #M Pass zip(sdfList,thryList) instead, ensures they're the same length
         molNames, refNumConfs, allIndices, elists, tlists = matchMinima(sdfList, thryList)
         pickle.dump([molNames, refNumConfs,allIndices,elists,tlists], open('match.pickle', 'wb'))
     else:
@@ -690,6 +646,7 @@ def debugging():
     #   [[[file1 mol1] [file2 mol1]] ... [[file1 molN] [file2 molN]]]
     #   also now the molecules are separated by sublist
     # could be done in matchMinima function but need elists for plotting
+    #M def reorder_indices_and_energies():
     numMols = len(refNumConfs)
     allMolIndices = [allIndices[i::numMols] for i in range(numMols)]
     elists = [elists[i::numMols] for i in range(numMols)]
@@ -702,6 +659,8 @@ def debugging():
     # =========================================================================
 
     if opt['verbose']:
+        #M def dump_energies(): Could be a member function for each molecule if
+        #M you made them a class
         for i, mn in enumerate(molNames):
             try:
                 writeRelEne(mn, rmselist[i],trimE[i],zeroes[i],thryList)
@@ -711,18 +670,25 @@ def debugging():
 
 
     if opt['tplot']:
+        #M def tplot():
 #        allMolTimes = getAllTimes(sdfList, thryList) # ordered by file, mol, conf
         allMolTimes = tlists
 
         # match conformer times using indices from matchMinima then get stats
+        #M What are minima times? Number of matching conformers? 
         allFileTimes = [[] for i in range(numMols)]
         allFileStds = [[] for i in range(numMols)]
+        #M Could iterate over molecules and use an offset
         for i in range(len(sdfList)*numMols):
             timeSuc = 0     # running sum of successfully matched minima times
             numSuc = 0      # running number of successful minima
             numconfs = refNumConfs[i%numMols]  # i%numMols gets some mol
             fileTimes = []  # collect successful times for stdevs
             for k in range(numconfs):
+                #M Might want to modify allIndices so instead of storing 
+                #M -1 you just store k, would allow more compact code
+                #M in multiple places. If you need to keep track of -1
+                #M val locations consider an array of True, False vals instead. 
                 thisIndex = allIndices[i][k]
                 if thisIndex == -1:
                     timeSuc += allMolTimes[i][k]
@@ -740,6 +706,7 @@ def debugging():
             allFileStds[i%numMols].append(np.std(np.array(fileTimes)))
             #print 'average ',fTimeAvg,' over ',numSuc," samples"
 
+        #M def calc_relative_speeds():
         # separately, go from allMolTimes and calculate relative speeds
         allMolTimes = [allMolTimes[i::numMols] for i in range(numMols)]
         timesByMol = reorganizeSublists(allMolTimes, allMolIndices)
@@ -753,6 +720,7 @@ def debugging():
 #            plotAvgTimes(name, fileTimes[1:], stdevs[1:], thryList[1:]) # ================== * REMOVE last element from all
 
         if opt['verbose']: # append time to relative energies file
+            #M def write_times_to_rel_ene():
             for i, name in enumerate(molNames):
 #            for name, fileTimes, stdevs in zip(molNames, allFileTimes, allFileStds, relTimes, sdTimes):
                 compF = open('relene_'+name+'.dat','a')
@@ -775,11 +743,13 @@ def debugging():
 
 
     if opt['eplot']:
+        #M def ... (Just for consistency)
         for name, minE in zip(molNames, trimE):
             plotMolMinima(name, minE, thryList)
             #plotMolMinima(name, minE, thryList, selected=[0,7,12]) # zero based index
 
     if opt['eheatplot'] is not None:
+        #M def ...
         rmsArray = []
         for infile in sdfList:
             with open(infile) as f:
@@ -797,6 +767,9 @@ def debugging():
     if opt['theatplot'] is not None:
         rmsArray = []
         for infile in sdfList:
+            #M This construct is called a context manager, they're useful
+            #M elsewhere to. I read this about them:
+            #M https://jeffknupp.com/blog/2016/03/07/python-with-context-managers/
             with open(infile) as f:
                 for line in f:
                     if "avg time" in line:
@@ -827,6 +800,8 @@ def debugging():
                         ravgs = [float(s) for s in j.split()[1:]]
                         tArray.append(ravgs)
                         break
+        #M Why are you doing this? It seems awkward, can you think of better
+        #M way?
         eArray = shiftArray(eArray)
         tArray = shiftArray(tArray)
         eArray = [item[:-1] for item in eArray] # ================== * REMOVE last element from all