From 120cd9296c0f1ed47495411e06907e7f8f30c3bf Mon Sep 17 00:00:00 2001
From: MehmetTopsakal <metokal@gmail.com>
Date: Wed, 26 Sep 2018 12:32:00 -0400
Subject: [PATCH 1/3] add find_sample_from_2dscan

---
 xpdtools/tools.py | 94 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 94 insertions(+)

diff --git a/xpdtools/tools.py b/xpdtools/tools.py
index 7f8a6f5..d0422fc 100644
--- a/xpdtools/tools.py
+++ b/xpdtools/tools.py
@@ -581,3 +581,97 @@ def inner(x, *args, **kwargs):
         return func(*args, **kwargs)
 
     return inner
+
+
+
+
+def find_sample_from_2dscan(I_arr, xy_arr, Q_arr=None, params=None):
+    """Find sample positions from xy-scan 
+    
+    Parameters
+    ----------
+    xy_arr : x,y of scan points
+    I_arr : Intensities for each scan point
+    Q_arr : Q-points (optional)
+    
+    params : (optional)
+    Parameters for DBSCAN clustering and point selection.
+    See DBSCAN documentation:
+    http://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html
+    - eps: The maximum distance between two samples for them to 
+      be considered as in the same neighborhood.
+    - min_samples: The number of samples (or total weight) 
+      in a neighborhood for a point to be considered as a core point. 
+      This includes the point itself.
+    - n_jobs: The number of parallel jobs to run. None means 1 
+      unless in a joblib.parallel_backend context. 
+      -1 means using all processors.  
+      s_ratio: Ratio of sample points to total points. If we have
+    100 dx,dy points and sample is expected to be found in 20 of
+    these points, then the remaining 80 should be background.
+    So, s_ratio is 0.2. Default value of 0.5 works fine.
+
+    use_unclassified: Sometimes DBSCAN is unable to classify 
+    points around sample boundary. It that case, it gives -1.
+    If this keyword is True, that point is considered in the 
+    sample positions (pts).
+
+    
+    Returns
+    -------
+    center : ndarray
+        xy coordinates of the center of the sample
+    pts : ndarray
+        xy coordinates of points considered within the sample
+    
+    """
+    
+    from sklearn.cluster import DBSCAN
+    from scipy.stats import spearmanr
+    
+    if params is None:
+        print('params is not provided. Using default parameters')
+        params = {'eps':0.05,'min_samples':20,'n_jobs':1,
+                  's_ratio':0.5,'qrange':(1,5),
+                 'use_unclassified':True}        
+    
+    if isinstance(Q_arr,np.ndarray):
+        if params['qrange']:
+            # Trim to selected Q range. Because we do not want mess around 
+            # sample holder and detector edge. This also speedups DBSCAN calculation
+            sel = (Q_arr > params['qrange'][0]) & (Q_arr < params['qrange'][1])            
+            I_arr = np.array([i[sel] for i in I_arr])
+    else:
+        print('Q array is not provided. Using whole points')
+         
+    # Use DBSCAN package to cluster I_arr
+    dbs   = DBSCAN(params['eps'], min_samples=params['min_samples'],
+                metric=lambda i, j: 1 - spearmanr(i, j)[0], n_jobs=params['n_jobs'])
+    preds = dbs.fit_predict(np.array(I_arr))
+    uniques, counts = np.unique(preds, return_counts=True)
+    ratios = counts / sum(counts)
+
+    # Collect x,y data for determining points which should correspond to the sample.
+    pts = []   
+
+    for j,u in enumerate(uniques):
+        mask = (preds == u)        
+        masked = []
+        for i,tf in enumerate(mask):
+            if tf:
+                masked.append(xy_arr[i])
+        # these are not classified
+        if u == -1:
+            if params['use_unclassified']:
+                pts.extend(masked)
+        else:
+            # these are possibly background
+            if not (ratios[j] >= params['s_ratio']):
+                pts.extend(masked)        
+    pts = np.array(pts)
+       
+    center = np.mean(pts, axis=0) 
+
+    # TODO: Get rid of s_ratio
+    
+    return center, pts

From a857f5fef4b674baf888871fe49c4a3a85fe5e29 Mon Sep 17 00:00:00 2001
From: MehmetTopsakal <metokal@gmail.com>
Date: Wed, 26 Sep 2018 14:53:43 -0400
Subject: [PATCH 2/3] update find_sample_from_2dscan

---
 xpdtools/tools.py | 59 ++++++++++++++++++++++-------------------------
 1 file changed, 27 insertions(+), 32 deletions(-)

diff --git a/xpdtools/tools.py b/xpdtools/tools.py
index d0422fc..ab7f8cc 100644
--- a/xpdtools/tools.py
+++ b/xpdtools/tools.py
@@ -24,6 +24,9 @@
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from functools import wraps
 
+from sklearn.cluster import DBSCAN
+from scipy.stats import spearmanr
+
 try:
     from diffpy.pdfgetx import PDFGetter
 except ImportError:
@@ -584,8 +587,9 @@ def inner(x, *args, **kwargs):
 
 
 
-
-def find_sample_from_2dscan(I_arr, xy_arr, Q_arr=None, params=None):
+def find_sample_from_2dscan(I_arr, xy_arr, Q_arr=None, 
+        eps=0.05, min_samples=20, n_jobs=1,
+        b_ratio_thres=0.5, qrange=(1,5), use_unclassified=True):
     """Find sample positions from xy-scan 
     
     Parameters
@@ -594,22 +598,22 @@ def find_sample_from_2dscan(I_arr, xy_arr, Q_arr=None, params=None):
     I_arr : Intensities for each scan point
     Q_arr : Q-points (optional)
     
-    params : (optional)
-    Parameters for DBSCAN clustering and point selection.
     See DBSCAN documentation:
     http://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html
-    - eps: The maximum distance between two samples for them to 
+    
+    eps: The maximum distance between two samples for them to 
       be considered as in the same neighborhood.
-    - min_samples: The number of samples (or total weight) 
+      
+    min_samples: The number of samples (or total weight) 
       in a neighborhood for a point to be considered as a core point. 
       This includes the point itself.
-    - n_jobs: The number of parallel jobs to run. None means 1 
+      
+    n_jobs: The number of parallel jobs to run. None means 1 
       unless in a joblib.parallel_backend context. 
       -1 means using all processors.  
-      s_ratio: Ratio of sample points to total points. If we have
-    100 dx,dy points and sample is expected to be found in 20 of
-    these points, then the remaining 80 should be background.
-    So, s_ratio is 0.2. Default value of 0.5 works fine.
+    
+    b_ratio_thres: Clusters containing more than this fraction of all
+    scan points are considered background (not belonging to sample).
 
     use_unclassified: Sometimes DBSCAN is unable to classify 
     points around sample boundary. It that case, it gives -1.
@@ -626,27 +630,18 @@ def find_sample_from_2dscan(I_arr, xy_arr, Q_arr=None, params=None):
     
     """
     
-    from sklearn.cluster import DBSCAN
-    from scipy.stats import spearmanr
-    
-    if params is None:
-        print('params is not provided. Using default parameters')
-        params = {'eps':0.05,'min_samples':20,'n_jobs':1,
-                  's_ratio':0.5,'qrange':(1,5),
-                 'use_unclassified':True}        
-    
     if isinstance(Q_arr,np.ndarray):
-        if params['qrange']:
-            # Trim to selected Q range. Because we do not want mess around 
-            # sample holder and detector edge. This also speedups DBSCAN calculation
-            sel = (Q_arr > params['qrange'][0]) & (Q_arr < params['qrange'][1])            
-            I_arr = np.array([i[sel] for i in I_arr])
+        # Trim to the selected Q range, because we do not want the mess around
+        # the beam stop and at high Q. This also speeds up the DBSCAN calculation.
+        sel = (Q_arr > qrange[0]) & (Q_arr < qrange[1])            
+        I_arr = np.array([i[sel] for i in I_arr])
     else:
-        print('Q array is not provided. Using whole points')
+        print('Q array is not provided. Using all points')
+        
          
     # Use DBSCAN package to cluster I_arr
-    dbs   = DBSCAN(params['eps'], min_samples=params['min_samples'],
-                metric=lambda i, j: 1 - spearmanr(i, j)[0], n_jobs=params['n_jobs'])
+    dbs   = DBSCAN(eps, min_samples=min_samples,
+                metric=lambda i, j: 1 - spearmanr(i, j)[0], n_jobs=n_jobs)
     preds = dbs.fit_predict(np.array(I_arr))
     uniques, counts = np.unique(preds, return_counts=True)
     ratios = counts / sum(counts)
@@ -655,18 +650,18 @@ def find_sample_from_2dscan(I_arr, xy_arr, Q_arr=None, params=None):
     pts = []   
 
     for j,u in enumerate(uniques):
+        
         mask = (preds == u)        
         masked = []
         for i,tf in enumerate(mask):
             if tf:
                 masked.append(xy_arr[i])
-        # these are not classified
+                
         if u == -1:
-            if params['use_unclassified']:
+            if use_unclassified:
                 pts.extend(masked)
         else:
-            # these are possibly background
-            if not (ratios[j] >= params['s_ratio']):
+            if (ratios[j] <= b_ratio_thres):
                 pts.extend(masked)        
     pts = np.array(pts)
        

From adf90af5362cc5ef8852ad3244fd6c89d033d4cc Mon Sep 17 00:00:00 2001
From: christopher <cjwright4242@gmail.com>
Date: Wed, 26 Sep 2018 18:49:27 -0400
Subject: [PATCH 3/3] add sklearn to requirements

---
 score.yaml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/score.yaml b/score.yaml
index 14f7e44..c0bb004 100644
--- a/score.yaml
+++ b/score.yaml
@@ -38,6 +38,8 @@ run:
     default: {conda: pims}
   tqdm:
     default: {conda: tqdm}
+  scikit-learn:
+    default: {conda: scikit-learn}
 test:
   pytest:
     default: {conda: pytest}
@@ -57,4 +59,4 @@ docs:
   sphinx:
     default: {conda: sphinx}
   sphinx_rtd_theme:
-    default: {conda: sphinx_rtd_theme}
\ No newline at end of file
+    default: {conda: sphinx_rtd_theme}