From dbcd1db8bfb3503e0c2f28c7dbf62d17048272b7 Mon Sep 17 00:00:00 2001 From: Jude Date: Tue, 31 Jan 2023 15:28:49 +0000 Subject: [PATCH 1/8] fix warning message --- ICSDClient.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ICSDClient.py b/ICSDClient.py index d6c2b62..7833ca7 100644 --- a/ICSDClient.py +++ b/ICSDClient.py @@ -180,7 +180,7 @@ def advanced_search(self, search_dict, search_type="or", property_list=["Collec self.session_history.append({search_string: response}) - soup = BeautifulSoup(response.content, "html.parser") + soup = BeautifulSoup(response.content, features="xml") search_results = soup.idnums.contents[0].split(" ") # search_results = [x for x in str(response.content).split("idnums")[1].split(" ")[1:-2]] From f4ad3766f4144eeef80e5f049aef6dc796822c11 Mon Sep 17 00:00:00 2001 From: Jude Date: Tue, 31 Jan 2023 21:03:03 +0000 Subject: [PATCH 2/8] concurrent cif download --- ICSDClient.py | 289 ++++++++++++++++++++++++++++---------------------- 1 file changed, 161 insertions(+), 128 deletions(-) diff --git a/ICSDClient.py b/ICSDClient.py index 7833ca7..47917ab 100644 --- a/ICSDClient.py +++ b/ICSDClient.py @@ -3,6 +3,9 @@ import numpy as np import datetime import pandas as pd +from contextlib import contextmanager +from functools import partial +from concurrent.futures import ThreadPoolExecutor, as_completed import requests from bs4 import BeautifulSoup @@ -42,25 +45,130 @@ def main(): client.logout() -class ICSDClient(): - def __init__(self, login_id=None, password=None, windows_client=False, timeout=15): - self.auth_token = None - self.session_history = [] - self.windows_client = windows_client +class ICSDHelper: + MAX_CIFS = 500 + + def __init__(self, id, pwd, verbose=False): + self.id = id + self.pwd = pwd + self.query_mgr = ICSDClient() + self.token = None + self.verbose = verbose self.search_dict = self.load_search_dict() - self.timeout = timeout - if login_id is not None: - self.login_id = login_id - self.password = password - self.authorize() + def connect(self): + self.token = self.query_mgr.authorize(self.id, self.pwd) + + def close_connection(self): + self.query_mgr.logout(self.token) + self.token = None + + @contextmanager + def temp_connection(self): + try: + token = self.query_mgr.authorize(self.id, self.pwd) + yield token + finally: + self.query_mgr.logout(token) + + def __enter__(self): + self.connect() + return self + + def __exit__(self, exc_type, exc_value, exc_traceback): + self.close_connection() + + def search(self, search_dict, search_type="and"): + for k, v in search_dict.items(): + if k not in self.search_dict: + return f"Invalid search term {k} in search dict. Call client.search_dict.keys() to see available search terms" - def __del__(self): - self.logout() + elif v is None: + search_dict.pop(k) - def authorize(self, verbose=True): - data = {"loginid": self.login_id, - "password": self.password} + search_string = f" {search_type} ".join([f"{str(k)} : {str(v)}" for k, v in search_dict.items()]) + + return self.query_mgr.advanced_search(self.token, search_string) + + def fetch_cifs(self, ids): + def fetch_cif_batch(ids): + with self.temp_connection() as auth_token: + return self.query_mgr.fetch_cifs(auth_token, ids) + + batched_ids = [ids[i: i + self.MAX_CIFS] for i in range(0, len(ids), self.MAX_CIFS)] + + if self.verbose: + print(f'Fetching {len(ids)} cifs in {len(batched_ids)} batches.') + + with ThreadPoolExecutor(max_workers=8) as exec: + fut_to_ids = {exec.submit(fetch_cif_batch, batch): batch for i, batch in enumerate(batched_ids)} + for future in as_completed(fut_to_ids): + ids = fut_to_ids[future] + try: + result = future.result() + yield True, result + except Exception as e: + yield False, ids + + def load_search_dict(self): + search_dict = {"AUTHORS" : None, # BIBLIOGRAPHY : Authors name for the main (first) reference Text + "ARTICLE" : None, # BIBLIOGRAPHY : Title of article for the main (first) reference Text + "PUBLICATIONYEAR" : None, # BIBLIOGRAPHY : Year of publication of an article in the reference Numerical, integer + "PAGEFIRST" : None, # BIBLIOGRAPHY : First page number of an article in the referenceNumerical, integer + "JOURNAL" : None, # BIBLIOGRAPHY : Title of journal for the reference Text + "VOLUME" : None, # BIBLIOGRAPHY : Volume of the journal in the reference Numerical, integer + "ABSTRACT" : None, # BIBLIOGRAPHY : Abstract for the main (first) reference Text + "KEYWORDS" : None, # BIBLIOGRAPHY : Keywords for the main (first) reference Text + "CELLVOLUME" : None, # CELL SEARCH : Cell volumeNumerical, floating point + "CALCDENSITY" : None, # CELL SEARCH : Calculated density Numerical, floating poit + "CELLPARAMETERS" : None, # CELL SEARCH : Cell lenght a,b,c and angles alpha, beta, gamma separated by whitespace, i.e.: a b c alpha beta gamma, * if any value Numerical, floating point + "SEARCH" : None, # CELLDATACELL SEARCH : Restriction of cellparameters.experimental, reduced, standardized + "STRUCTUREDFORMULA" : None, # A CHEMISTRY SEARCH : Search for typical chemical groups Text + "CHEMICALNAME" : None, # CHEMISTRY SEARCH : Search for (parts of) the chemical name Text + "MINERALNAME" : None, # CHEMISTRY SEARCH : Search for the mineral name Text + "MINERALGROUP" : None, # CHEMISTRY SEARCH : Search for the mineral group Text + "ZVALUECHEMISTRY" : None, # SEARCH :Number of formula units per unit cell Numerical, integer + "ANXFORMULA" : None, # CHEMISTRY SEARCH : Search for the ANX formula Text + "ABFORMULA" : None, # CHEMISTRY SEARCH : Search for the AB formula Text + "FORMULAWEIGHT" : None, # CHEMISTRY SEARCH : Search for the formula weight Numerical, floating point + "NUMBEROFELEMENTS" : None, # CHEMISTRY SEARCH : Search for number of elementsinteger + "COMPOSITION" : None, # CHEMISTRY SEARCH : Search for the chemical composition (including stochiometric coefficients and/or oxidation numbers: EL:Co.(min):Co.(max):Ox.(min):Ox.(max)with El=element, Co=coefficient, Ox=oxidation number) Text + "COLLECTIONCODE" : None, # DB INFO : ICSD collection codeNumerical, integer + "PDFNUMBER" : None, # DB INFO : PDF number as assigned by ICDD Text + "RELEASE" : None, # DB INFO : Release tagNumerical, integer, special format + "RECORDINGDATE" : None, # DB INFO : Recording date of an ICSD entry Numerical, integer, special format + "MODIFICATIONDATE" : None, # DB INFO : Modification date of an ICSD entry Numerical, integer, special format + "COMMENT" : None, # EXPERIMENTAL SEARCH : Search for a comment Text + "RVALUE" : None, # EXPERIMENTAL SEARCH : R-value of the refinement (0.00 ... 1.00) Numerical, floating point + "TEMPERATURE" : None, # EXPERIMENTAL SEARCH : Temperature of the measurement Numerical, floating point + "PRESSURE" : None, # EXPERIMENTAL SEARCH : Pressure during the measurement Numerical, floating point + "SAMPLETYPE": None, # EXPERIMENTAL SEARCH : Search for the sample type: powder, singlecrystal + "RADIATIONTYPE": None, # EXPERIMENTAL SEARCH : Search for the radiation type: xray, electrons, neutrons, synchotron + "STRUCTURETYPE" : None, # STRUCTURE TYPE : Search for predefined structure types directly Select one + "SPACEGROUPSYMBOL" : None, # SYMMETRY : Search for the space group symbol Text + "SPACEGROUPNUMBER" : None, # SYMMETRY : Search for the space group number Numerical, integer + "BRAVAISLATTICE" : None, # SYMMETRY : Select One: Primitive, a-centered, b-centered, c-centered, Body-centered, Rhombohedral, Face-centered Select one + "CRYSTALSYSTEM" : None, # SYMMETRY : Crystal system Select one + "CRYSTALCLASS" : None, # SYMMETRY : Search for the crystal class Text + "LAUECLASS" : None, # SYMMETRY : Search for predefined Laueclass: -1, -3, -3m, 2/m, 4/m, 4/mmm ,6/m 6/mmm ,m-3 ,m-3m ,mmm Select one + "WYCKOFFSEQUENCE" : None, # SYMMETRY : Search for the Wyckoff sequence Text + "PEARSONSYMBOL" : None, # SYMMETRY : Search for the Pearson symbol Text + "INVERSIONCENTER" : None, # SYMMETRY : Should inversion center be included? TRUE or FALSE + "POLARAXIS" : None} # SYMMETRY : Should polar axis be included TRUE or FALSE + + return {k.lower(): v for k, v in search_dict.items()} + +class ICSDClient: + STATUS_OK = 200 + + def __init__(self, windows_client=False, timeout=15): + self.session_history = [] + self.windows_client = windows_client + self.timeout = timeout + + def authorize(self, id, pwd, verbose=True): + data = {"loginid": id, + "password": pwd} headers = { 'accept': 'text/plain', @@ -70,21 +178,20 @@ def authorize(self, verbose=True): response = requests.post('https://icsd.fiz-karlsruhe.de/ws/auth/login', headers=headers, data=data) + + self.session_history.append(response) - if response.status_code == 200: - self.auth_token = response.headers['ICSD-Auth-Token'] - if verbose: print(f"Authentication succeeded. Your Auth Token for this session is {self.auth_token} which will expire in one hour. Please remember to call client.logout() when you have finished.") + if response.status_code == self.STATUS_OK: + token = response.headers['ICSD-Auth-Token'] + if verbose: print(f"Authentication succeeded. Your Auth Token for this session is {token} which will expire in one hour.") + return token else: if verbose: print(response.content) - self.session_history.append(response) - - return response - - def logout(self, verbose=True): + def logout(self, auth_token, verbose=True): headers = { 'accept': 'text/plain', - 'ICSD-Auth-Token': self.auth_token, + 'ICSD-Auth-Token': auth_token, } response = requests.get('https://icsd.fiz-karlsruhe.de/ws/auth/logout', headers=headers) @@ -113,11 +220,11 @@ def writeout(self, cifs, folder="./cifs/"): for line in cif.splitlines(): f.write(line + "\n") - def search(self, searchTerm, content_type=None): + def search(self, auth_token, searchTerm, content_type=None): ''' Available content EXPERIMENTAL_INORGANIC, EXPERIMENTAL_METALORGANIC, THERORETICAL_STRUCTURES ''' - if self.auth_token is None: + if auth_token is None: print("You are not authenticated, call client.authorize() first") return @@ -135,7 +242,7 @@ def search(self, searchTerm, content_type=None): headers = { 'accept': 'application/xml', - 'ICSD-Auth-Token': self.auth_token, + 'ICSD-Auth-Token': auth_token, } response = requests.get('https://icsd.fiz-karlsruhe.de/ws/search/simple', @@ -151,16 +258,8 @@ def search(self, searchTerm, content_type=None): return list(zip(search_results, compositions)) - def advanced_search(self, search_dict, search_type="or", property_list=["CollectionCode", "StructuredFormula"]): - for k, v in search_dict.items(): - if k not in self.search_dict: - return f"Invalid search term {k} in search dict. Call client.search_dict.keys() to see available search terms" - - elif v is None: - search_dict.pop(k) - - search_string = f" {search_type} ".join([f"{str(k)} : {str(v)}" for k, v in search_dict.items()]) - + def advanced_search(self, auth_token, search_string): + # , property_list=["CollectionCode", "StructuredFormula"]): params = ( ('query', search_string), ('content type', "EXPERIMENTAL_INORGANIC"), @@ -168,7 +267,7 @@ def advanced_search(self, search_dict, search_type="or", property_list=["Collec headers = { 'accept': 'application/xml', - 'ICSD-Auth-Token': self.auth_token, + 'ICSD-Auth-Token': auth_token, } response = requests.get('https://icsd.fiz-karlsruhe.de/ws/search/expert', @@ -182,13 +281,14 @@ def advanced_search(self, search_dict, search_type="or", property_list=["Collec soup = BeautifulSoup(response.content, features="xml") search_results = soup.idnums.contents[0].split(" ") + return search_results # search_results = [x for x in str(response.content).split("idnums")[1].split(" ")[1:-2]] - properties = self.fetch_data(search_results, property_list=property_list) + # properties = self.fetch_data(search_results, property_list=property_list) - return list(zip(search_results, properties)) + # return list(zip(search_results, properties)) - def fetch_data(self, ids, property_list=["CollectionCode", "StructuredFormula"]): + def fetch_data(self, auth_token, ids, property_list=["CollectionCode", "StructuredFormula"]): """ Available properties: CollectionCode, HMS, StructuredFormula, StructureType, Title, Authors, Reference, CellParameter, ReducedCellParameter, StandardizedCellParameter, @@ -206,8 +306,8 @@ def fetch_data(self, ids, property_list=["CollectionCode", "StructuredFormula"]) property_list=property_list)) if i % 2 == 0: - self.logout(verbose=False) - self.authorize(verbose=False) + self.logout(auth_token, verbose=False) + self.authorize(verbose=False) # TODO fails flattened = [item for sublist in return_responses for item in sublist] @@ -215,7 +315,7 @@ def fetch_data(self, ids, property_list=["CollectionCode", "StructuredFormula"]) headers = { 'accept': 'application/csv', - 'ICSD-Auth-Token': self.auth_token, + 'ICSD-Auth-Token': auth_token, } params = ( @@ -239,14 +339,14 @@ def fetch_data(self, ids, property_list=["CollectionCode", "StructuredFormula"]) return data - def fetch_cif(self, id): - if self.auth_token is None: + def fetch_cif(self, auth_token, id): + if auth_token is None: print("You are not authenticated, call client.authorize() first") return headers = { 'accept': 'application/cif', - 'ICSD-Auth-Token': self.auth_token, + 'ICSD-Auth-Token': auth_token, } params = ( @@ -260,38 +360,17 @@ def fetch_cif(self, id): return response.content.decode("UTF-8").strip() - def fetch_cifs(self, ids): - if self.auth_token is None: + def fetch_cifs(self, auth_token, ids): + if auth_token is None: print("You are not authenticated, call client.authorize() first") return if isinstance(ids[0], tuple): ids = [x[0] for x in ids] - if len(ids) > 500: - chunked_ids = np.array_split(ids, np.ceil(len(ids)/500)) - return_responses = [] - - for i, chunk in enumerate(chunked_ids): - if i % 2 == 0: - self.logout(verbose=False) - self.authorize(verbose=False) - - return_responses.append(self.fetch_cifs(chunk)) - - flattened = [item for sublist in return_responses for item in sublist] - - return_responses = ''.join(flattened) - - cifs = re.split("\(C\) 2021 by FIZ Karlsruhe", return_responses)[1:] - cifs = [f'(C) {datetime.date.today().strftime("%Y")} by FIZ Karlsruhe' + x for x in cifs] - cifs = [x.encode("UTF-8") for x in cifs] - - return cifs - headers = { 'accept': 'application/cif', - 'ICSD-Auth-Token': self.auth_token, + 'ICSD-Auth-Token': auth_token, } params = ( @@ -302,21 +381,22 @@ def fetch_cifs(self, ids): ) response = requests.get('https://icsd.fiz-karlsruhe.de/ws/cif/multiple', headers=headers, params=params) + if response.status_code == self.STATUS_OK: + cifs = response.content.decode("UTF-8").split('#(C)')[1:] + return ['#(C)'+cif for cif in cifs] + else: + raise Exception('Failed to get cifs.') - cifs = re.split("\\(C\\) [0-9]{4} by FIZ Karlsruhe", response.content.decode("UTF-8"))[1:] - cifs = [f"(C) 2022 by FIZ Karlsruhe" + x for x in cifs] - - return cifs - - def fetch_all_cifs(self, cif_path="./cifs/"): + # TODO move out + def fetch_all_cifs(self, auth_token, cif_path="./cifs/"): for x in range(0, 1000000, 500): - self.logout(verbose=False) - self.authorize(verbose=False) + self.logout(auth_token, verbose=False) + self.authorize(verbose=False) print(f"{x}-{x+499}") - search_res = self.advanced_search({"collectioncode": f"{x}-{x+499}"}) + search_res = self.advanced_search(auth_token, {"collectioncode": f"{x}-{x+499}"}) - cifs = self.fetch_cifs(search_res) + cifs = self.fetch_cifs(auth_token, search_res) try: print(cifs[0]) @@ -327,53 +407,6 @@ def fetch_all_cifs(self, cif_path="./cifs/"): self.writeout(cifs, cif_path) - def load_search_dict(self): - search_dict = {"AUTHORS" : None, # BIBLIOGRAPHY : Authors name for the main (first) reference Text - "ARTICLE" : None, # BIBLIOGRAPHY : Title of article for the main (first) reference Text - "PUBLICATIONYEAR" : None, # BIBLIOGRAPHY : Year of publication of an article in the reference Numerical, integer - "PAGEFIRST" : None, # BIBLIOGRAPHY : First page number of an article in the referenceNumerical, integer - "JOURNAL" : None, # BIBLIOGRAPHY : Title of journal for the reference Text - "VOLUME" : None, # BIBLIOGRAPHY : Volume of the journal in the reference Numerical, integer - "ABSTRACT" : None, # BIBLIOGRAPHY : Abstract for the main (first) reference Text - "KEYWORDS" : None, # BIBLIOGRAPHY : Keywords for the main (first) reference Text - "CELLVOLUME" : None, # CELL SEARCH : Cell volumeNumerical, floating point - "CALCDENSITY" : None, # CELL SEARCH : Calculated density Numerical, floating poit - "CELLPARAMETERS" : None, # CELL SEARCH : Cell lenght a,b,c and angles alpha, beta, gamma separated by whitespace, i.e.: a b c alpha beta gamma, * if any value Numerical, floating point - "SEARCH" : None, # CELLDATACELL SEARCH : Restriction of cellparameters.experimental, reduced, standardized - "STRUCTUREDFORMULA" : None, # A CHEMISTRY SEARCH : Search for typical chemical groups Text - "CHEMICALNAME" : None, # CHEMISTRY SEARCH : Search for (parts of) the chemical name Text - "MINERALNAME" : None, # CHEMISTRY SEARCH : Search for the mineral name Text - "MINERALGROUP" : None, # CHEMISTRY SEARCH : Search for the mineral group Text - "ZVALUECHEMISTRY" : None, # SEARCH :Number of formula units per unit cell Numerical, integer - "ANXFORMULA" : None, # CHEMISTRY SEARCH : Search for the ANX formula Text - "ABFORMULA" : None, # CHEMISTRY SEARCH : Search for the AB formula Text - "FORMULAWEIGHT" : None, # CHEMISTRY SEARCH : Search for the formula weight Numerical, floating point - "NUMBEROFELEMENTS" : None, # CHEMISTRY SEARCH : Search for number of elementsinteger - "COMPOSITION" : None, # CHEMISTRY SEARCH : Search for the chemical composition (including stochiometric coefficients and/or oxidation numbers: EL:Co.(min):Co.(max):Ox.(min):Ox.(max)with El=element, Co=coefficient, Ox=oxidation number) Text - "COLLECTIONCODE" : None, # DB INFO : ICSD collection codeNumerical, integer - "PDFNUMBER" : None, # DB INFO : PDF number as assigned by ICDD Text - "RELEASE" : None, # DB INFO : Release tagNumerical, integer, special format - "RECORDINGDATE" : None, # DB INFO : Recording date of an ICSD entry Numerical, integer, special format - "MODIFICATIONDATE" : None, # DB INFO : Modification date of an ICSD entry Numerical, integer, special format - "COMMENT" : None, # EXPERIMENTAL SEARCH : Search for a comment Text - "RVALUE" : None, # EXPERIMENTAL SEARCH : R-value of the refinement (0.00 ... 1.00) Numerical, floating point - "TEMPERATURE" : None, # EXPERIMENTAL SEARCH : Temperature of the measurement Numerical, floating point - "PRESSURE" : None, # EXPERIMENTAL SEARCH : Pressure during the measurement Numerical, floating point - "SAMPLETYPE": None, # EXPERIMENTAL SEARCH : Search for the sample type: powder, singlecrystal - "RADIATIONTYPE": None, # EXPERIMENTAL SEARCH : Search for the radiation type: xray, electrons, neutrons, synchotron - "STRUCTURETYPE" : None, # STRUCTURE TYPE : Search for predefined structure types directly Select one - "SPACEGROUPSYMBOL" : None, # SYMMETRY : Search for the space group symbol Text - "SPACEGROUPNUMBER" : None, # SYMMETRY : Search for the space group number Numerical, integer - "BRAVAISLATTICE" : None, # SYMMETRY : Select One: Primitive, a-centered, b-centered, c-centered, Body-centered, Rhombohedral, Face-centered Select one - "CRYSTALSYSTEM" : None, # SYMMETRY : Crystal system Select one - "CRYSTALCLASS" : None, # SYMMETRY : Search for the crystal class Text - "LAUECLASS" : None, # SYMMETRY : Search for predefined Laueclass: -1, -3, -3m, 2/m, 4/m, 4/mmm ,6/m 6/mmm ,m-3 ,m-3m ,mmm Select one - "WYCKOFFSEQUENCE" : None, # SYMMETRY : Search for the Wyckoff sequence Text - "PEARSONSYMBOL" : None, # SYMMETRY : Search for the Pearson symbol Text - "INVERSIONCENTER" : None, # SYMMETRY : Should inversion center be included? TRUE or FALSE - "POLARAXIS" : None} # SYMMETRY : Should polar axis be included TRUE or FALSE - - return {k.lower(): v for k, v in search_dict.items()} if __name__ == "__main__": main() From 159a899e49f7a5fa99b9f908346418b83a0edde6 Mon Sep 17 00:00:00 2001 From: Jude Date: Tue, 31 Jan 2023 21:44:06 +0000 Subject: [PATCH 3/8] concurrent data download --- ICSDClient.py | 163 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 104 insertions(+), 59 deletions(-) diff --git a/ICSDClient.py b/ICSDClient.py index 47917ab..062fab1 100644 --- a/ICSDClient.py +++ b/ICSDClient.py @@ -6,44 +6,56 @@ from contextlib import contextmanager from functools import partial from concurrent.futures import ThreadPoolExecutor, as_completed +from csv import DictWriter import requests from bs4 import BeautifulSoup def main(): - - client = ICSDClient("YOUR_USERNAME", "YOUR_PASSWORD") + def test(cli): + search_string = {"numberofelements": "1", 'composition': 'Fe'} + ids = cli.search(search_string) + print(len(ids)) + cli.data_to_csv(ids[0]) + # success, data = next(gen) # unpack generator + # print(f'successful?: {success}') + # print(data) - search_dict = {"collectioncode": "1-5000"} + with ICSDHelper('AVV9002682', 'icsd590') as cli: + test(cli) - search = client.advanced_search(search_dict, - property_list=["CollectionCode", "StructuredFormula","CalculatedDensity","MeasuredDensity","CellVolume"]) + # client = ICSDClient("YOUR_USERNAME", "YOUR_PASSWORD") + + # search_dict = {"collectioncode": "1-5000"} + + # search = client.advanced_search(search_dict, + # property_list=["CollectionCode", "StructuredFormula","CalculatedDensity","MeasuredDensity","CellVolume"]) - data=[] + # data=[] - for i,item in enumerate(search): - data.append([int(item[0]),int(item[1][0]),item[1][1],item[1][2],item[1][3],item[1][4]]) + # for i,item in enumerate(search): + # data.append([int(item[0]),int(item[1][0]),item[1][1],item[1][2],item[1][3],item[1][4]]) - pd_data=pd.DataFrame(data,columns=['DB_id','Col_code','name','cal_density', 'meas_density','cellvolume']) + # pd_data=pd.DataFrame(data,columns=['DB_id','Col_code','name','cal_density', 'meas_density','cellvolume']) - pd_data.to_csv('densities.csv',index=True) + # pd_data.to_csv('densities.csv',index=True) - # search_dict = {"collectioncode": "1-100"} + # # search_dict = {"collectioncode": "1-100"} - # search = client.advanced_search(search_dict) - # cifs = client.fetch_cifs(search) + # # search = client.advanced_search(search_dict) + # # cifs = client.fetch_cifs(search) - # x = client.search("Li O") - # cifs = client.fetch_cifs(search) + # # x = client.search("Li O") + # # cifs = client.fetch_cifs(search) - # client.fetch_all_cifs() + # # client.fetch_all_cifs() - # cif = client.fetch_cif(1) - # client.writeout(cif) + # # cif = client.fetch_cif(1) + # # client.writeout(cif) - client.logout() + # client.logout() class ICSDHelper: MAX_CIFS = 500 @@ -51,7 +63,7 @@ class ICSDHelper: def __init__(self, id, pwd, verbose=False): self.id = id self.pwd = pwd - self.query_mgr = ICSDClient() + self.query_mgr = ICSDClient(verbose) self.token = None self.verbose = verbose self.search_dict = self.load_search_dict() @@ -110,6 +122,57 @@ def fetch_cif_batch(ids): except Exception as e: yield False, ids + def fetch_data(self, ids, property_list=None): + def fetch_data_batch(ids, batch_idx): + query = partial( + self.query_mgr.fetch_data, + property_list=property_list) + + with self.temp_connection() as auth_token: + return query(auth_token, ids, batch_idx) + + batched_ids = [ids[i: i + self.MAX_CIFS] for i in range(0, len(ids), self.MAX_CIFS)] + + if self.verbose: + print(f'Fetching data for {len(ids)} items in {len(batched_ids)} batches.') + + with ThreadPoolExecutor(max_workers=8) as exec: + fut_to_ids = {exec.submit(fetch_data_batch, batch, i + 1): batch for i, batch in enumerate(batched_ids)} + for future in as_completed(fut_to_ids): + ids = fut_to_ids[future] + try: + result = future.result() + yield True, result # result = header, data + except Exception as e: + yield False, ids + + def data_to_csv(self, ids, output_folder='./output', output_file='icsd_data', columns=[]): + if not os.path.exists(output_folder): + os.makedirs(output_folder) + + with open(os.path.join(output_folder, output_file+'.csv'), "w", newline='') as f: + first = True + failed_ids = [] + for success, result in self.fetch_data(ids, columns): + if success: + csv_header, csv_data = result[0], result[1] + if first: + writer = DictWriter(f, fieldnames=csv_header) + writer.writeheader() + first = False + for data in csv_data: + line = dict(zip(csv_header, data)) + line['CollectionCode'] = str(line['CollectionCode']).zfill(6) + writer.writerow(line) + else: + failed_ids.extend(result) + + if failed_ids: + with open(f'{output_file}_failed_to_download_ids.txt', 'w') as f: + for id in failed_ids: + f.write(id+'\n') + + def load_search_dict(self): search_dict = {"AUTHORS" : None, # BIBLIOGRAPHY : Authors name for the main (first) reference Text "ARTICLE" : None, # BIBLIOGRAPHY : Title of article for the main (first) reference Text @@ -161,10 +224,11 @@ def load_search_dict(self): class ICSDClient: STATUS_OK = 200 - def __init__(self, windows_client=False, timeout=15): + def __init__(self, verbose=False, windows_client=False, timeout=15): self.session_history = [] self.windows_client = windows_client self.timeout = timeout + self.verbose = verbose def authorize(self, id, pwd, verbose=True): data = {"loginid": id, @@ -259,7 +323,6 @@ def search(self, auth_token, searchTerm, content_type=None): return list(zip(search_results, compositions)) def advanced_search(self, auth_token, search_string): - # , property_list=["CollectionCode", "StructuredFormula"]): params = ( ('query', search_string), ('content type', "EXPERIMENTAL_INORGANIC"), @@ -282,13 +345,9 @@ def advanced_search(self, auth_token, search_string): soup = BeautifulSoup(response.content, features="xml") search_results = soup.idnums.contents[0].split(" ") return search_results - # search_results = [x for x in str(response.content).split("idnums")[1].split(" ")[1:-2]] - # properties = self.fetch_data(search_results, property_list=property_list) - - # return list(zip(search_results, properties)) - def fetch_data(self, auth_token, ids, property_list=["CollectionCode", "StructuredFormula"]): + def fetch_data(self, auth_token, ids, batch_idx=1, property_list = None): """ Available properties: CollectionCode, HMS, StructuredFormula, StructureType, Title, Authors, Reference, CellParameter, ReducedCellParameter, StandardizedCellParameter, @@ -297,47 +356,33 @@ def fetch_data(self, auth_token, ids, property_list=["CollectionCode", "Structur CalculatedDensity, MeasuredDensity, PearsonSymbol, WyckoffSequence, Journal, Volume, PublicationYear, Page, Quality """ - if len(ids) > 500: - chunked_ids = np.array_split(ids, np.ceil(len(ids)/500)) - - return_responses = [] - for i, chunk in enumerate(chunked_ids): - return_responses.append(self.fetch_data(chunk, - property_list=property_list)) - - if i % 2 == 0: - self.logout(auth_token, verbose=False) - self.authorize(verbose=False) # TODO fails - - flattened = [item for sublist in return_responses for item in sublist] - - return flattened + def format_response(response): + output = response.content.decode("UTF-8") + header, *data = output.split('\n') + header = header.split() + if len(data) > 0 and data[-1] == '': # output ending with \n creates an empty entry after split('\n') + data.pop() + data = [line.split('\t') for line in data] + self.session_history.append({str(ids): data}) + return header, data + + if self.verbose: + print(f'Fetching data for {len(ids)} items (batch {batch_idx}).') headers = { 'accept': 'application/csv', 'ICSD-Auth-Token': auth_token, } - - params = ( + if property_list is None: property_list = [] + params = [ ('idnum', ids), - ('windowsclient', self.windows_client), - ('listSelection', property_list), - ) + ('windowsclient', False), + ('listSelection', ['CollectionCode', 'SumFormula', 'StructuredFormula'] + property_list)] response = requests.get('https://icsd.fiz-karlsruhe.de/ws/csv', headers=headers, params=params) + + return format_response(response) - data = str(response.content).split("\\t\\n")[1:-1] - - # If there's only a single response - if len(data) == 0 and len(ids) != 0: - data = str(response.content).split("\\t\\r\\n")[1:-1] - - if len(property_list) > 1: - data = [x.split("\\t") for x in data] - - self.session_history.append({str(ids): data}) - - return data def fetch_cif(self, auth_token, id): if auth_token is None: From 597eacd90f0b8e3dc85be438eea77822e1d17148 Mon Sep 17 00:00:00 2001 From: Jude Date: Tue, 31 Jan 2023 22:15:28 +0000 Subject: [PATCH 4/8] separate search string generator --- ICSDClient.py | 47 ++++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/ICSDClient.py b/ICSDClient.py index 062fab1..b8eb1ef 100644 --- a/ICSDClient.py +++ b/ICSDClient.py @@ -13,10 +13,11 @@ def main(): def test(cli): - search_string = {"numberofelements": "1", 'composition': 'Fe'} + search_string = "numberofelements: 1 and composition: Fe" ids = cli.search(search_string) print(len(ids)) - cli.data_to_csv(ids[0]) + cli.data_to_csv(ids) + # cli.fetch_cifs(ids) # success, data = next(gen) # unpack generator # print(f'successful?: {success}') # print(data) @@ -90,7 +91,20 @@ def __enter__(self): def __exit__(self, exc_type, exc_value, exc_traceback): self.close_connection() - def search(self, search_dict, search_type="and"): + def search(self, search_string): + if self.token: + try: + ids = self.query_mgr.advanced_search(self.token, search_string) + except ConnectionError as e: + self.connect() # second attempt since stored token was rejected. + ids = self.query_mgr.advanced_search(self.token, search_string) + else: + with self.temp_connection() as auth_token: + ids = self.query_mgr.advanced_search(self.token, search_string) + + return ids + + def build_search_string(self, search_dict, search_type='or'): for k, v in search_dict.items(): if k not in self.search_dict: return f"Invalid search term {k} in search dict. Call client.search_dict.keys() to see available search terms" @@ -99,8 +113,7 @@ def search(self, search_dict, search_type="and"): search_dict.pop(k) search_string = f" {search_type} ".join([f"{str(k)} : {str(v)}" for k, v in search_dict.items()]) - - return self.query_mgr.advanced_search(self.token, search_string) + return search_string def fetch_cifs(self, ids): def fetch_cif_batch(ids): @@ -432,24 +445,12 @@ def fetch_cifs(self, auth_token, ids): else: raise Exception('Failed to get cifs.') - # TODO move out - def fetch_all_cifs(self, auth_token, cif_path="./cifs/"): - for x in range(0, 1000000, 500): - self.logout(auth_token, verbose=False) - self.authorize(verbose=False) - - print(f"{x}-{x+499}") - search_res = self.advanced_search(auth_token, {"collectioncode": f"{x}-{x+499}"}) - - cifs = self.fetch_cifs(auth_token, search_res) - - try: - print(cifs[0]) - print(cifs[-1]) - except: - print("\n\nNO CIFS RETURNED, LAST RESPONSE:\n") - print(self.session_history[-1].content) - +def fetch_all_cifs(self, auth_token, cif_path="./cifs/"): + max_coll_code = 1_000_000 + with ICSDHelper() as cli: + search_string = f"collectioncode=0-{max_coll_code}" + ids = cli.search(search_string) + for success, cifs in cli.fetch_cifs(ids): self.writeout(cifs, cif_path) From abfe84006950e399e289a615778d00d3127c2806 Mon Sep 17 00:00:00 2001 From: Jude Date: Tue, 31 Jan 2023 23:02:43 +0000 Subject: [PATCH 5/8] output cifs to zip --- ICSDClient.py | 85 ++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 68 insertions(+), 17 deletions(-) diff --git a/ICSDClient.py b/ICSDClient.py index b8eb1ef..8daae39 100644 --- a/ICSDClient.py +++ b/ICSDClient.py @@ -7,6 +7,8 @@ from functools import partial from concurrent.futures import ThreadPoolExecutor, as_completed from csv import DictWriter +import zipfile +import io import requests from bs4 import BeautifulSoup @@ -16,13 +18,14 @@ def test(cli): search_string = "numberofelements: 1 and composition: Fe" ids = cli.search(search_string) print(len(ids)) - cli.data_to_csv(ids) + # cli.data_to_csv(ids) + cli.cifs_to_zip(ids, 'test_search') # cli.fetch_cifs(ids) # success, data = next(gen) # unpack generator # print(f'successful?: {success}') # print(data) - with ICSDHelper('AVV9002682', 'icsd590') as cli: + with ICSDHelper('AVV9002682', 'icsd590', verbose=True) as cli: test(cli) # client = ICSDClient("YOUR_USERNAME", "YOUR_PASSWORD") @@ -115,10 +118,15 @@ def build_search_string(self, search_dict, search_type='or'): search_string = f" {search_type} ".join([f"{str(k)} : {str(v)}" for k, v in search_dict.items()]) return search_string - def fetch_cifs(self, ids): - def fetch_cif_batch(ids): + def fetch_cifs(self, ids, zip=False, output_file='icsd'): + def fetch_cif_batch(ids, batch_idx): + query = partial( + self.query_mgr.fetch_cifs, + zip = zip, + output_file = output_file) + with self.temp_connection() as auth_token: - return self.query_mgr.fetch_cifs(auth_token, ids) + return query(auth_token, ids, batch_idx) batched_ids = [ids[i: i + self.MAX_CIFS] for i in range(0, len(ids), self.MAX_CIFS)] @@ -126,15 +134,47 @@ def fetch_cif_batch(ids): print(f'Fetching {len(ids)} cifs in {len(batched_ids)} batches.') with ThreadPoolExecutor(max_workers=8) as exec: - fut_to_ids = {exec.submit(fetch_cif_batch, batch): batch for i, batch in enumerate(batched_ids)} + fut_to_ids = {exec.submit(fetch_cif_batch, batch, i + 1): batch for i, batch in enumerate(batched_ids)} for future in as_completed(fut_to_ids): ids = fut_to_ids[future] try: result = future.result() yield True, result except Exception as e: + raise e yield False, ids + def cifs_to_zip(self, ids, output_folder='./output', output_file='icsd'): + def copy_all(from_zip, to_zip): + for fname in from_zip.namelist(): + with from_zip.open(fname) as next_file: + # file name is provided as ``output_file``_CollCode{ccode}.cif + # extract {ccode} and fix length to 6 digits + ccode = fname[len(output_file) + 9: -4] + ccode = f"{int(ccode):06}" + bio = io.BytesIO(next_file.read()) + to_zip.writestr(f"{output_file}_{ccode}", bio.getvalue()) + + if not os.path.exists(output_folder): + os.makedirs(output_folder) + results_file = os.path.join(output_folder, output_file+'_results.zip') + failed_file = os.path.join(output_folder, output_file+'_failed_to_download_ids.txt') + + failed_ids = [] + with zipfile.ZipFile(results_file, mode='w') as archive: + # with zipfile.ZipFile(f'{output_file}_results.zip', mode='w') as archive: + for success, result in self.fetch_cifs(ids, zip=True, output_file=output_file): + if success: + with zipfile.ZipFile(io.BytesIO(result)) as zf1: + copy_all(zf1, archive) + else: + failed_ids.extend(result) + + if failed_ids: + with open(failed_file, 'w') as f: + for id in failed_ids: + f.write(id+'\n') + def fetch_data(self, ids, property_list=None): def fetch_data_batch(ids, batch_idx): query = partial( @@ -418,32 +458,43 @@ def fetch_cif(self, auth_token, id): return response.content.decode("UTF-8").strip() - def fetch_cifs(self, auth_token, ids): + def fetch_cifs(self, auth_token, ids, batch_idx = 1, zip = False, output_file='icsd'): if auth_token is None: print("You are not authenticated, call client.authorize() first") return if isinstance(ids[0], tuple): ids = [x[0] for x in ids] - + + if self.verbose: + print(f'Fetching {len(ids)} cifs (batch {batch_idx}).') + headers = { 'accept': 'application/cif', 'ICSD-Auth-Token': auth_token, } - params = ( + params = [ ('idnum', ids), ('celltype', 'experimental'), ('windowsclient', self.windows_client), - ('filetype', 'cif'), - ) - - response = requests.get('https://icsd.fiz-karlsruhe.de/ws/cif/multiple', headers=headers, params=params) - if response.status_code == self.STATUS_OK: - cifs = response.content.decode("UTF-8").split('#(C)')[1:] - return ['#(C)'+cif for cif in cifs] + ] + if zip: + params.append(('filename', output_file)) + params.append(('filetype', 'zip')) + response = requests.get('https://icsd.fiz-karlsruhe.de/ws/cif/multiple', headers=headers, params=params) + if response.status_code == self.STATUS_OK: + return response.content + else: + raise Exception('Failed to get cifs.') else: - raise Exception('Failed to get cifs.') + params.append(('filetype', 'cif')) + response = requests.get('https://icsd.fiz-karlsruhe.de/ws/cif/multiple', headers=headers, params=params) + if response.status_code == self.STATUS_OK: + cifs = response.content.decode("UTF-8").split('#(C)')[1:] + return ['#(C)'+cif for cif in cifs] + else: + raise Exception('Failed to get cifs.') def fetch_all_cifs(self, auth_token, cif_path="./cifs/"): max_coll_code = 1_000_000 From d9702ec7af819d5ca70ac3de84e12cbaf8b460ec Mon Sep 17 00:00:00 2001 From: Jude Date: Wed, 1 Feb 2023 01:09:45 +0000 Subject: [PATCH 6/8] tidying up --- ICSDClient.py | 212 ++++++++++++++++++-------------------------------- 1 file changed, 77 insertions(+), 135 deletions(-) diff --git a/ICSDClient.py b/ICSDClient.py index 8daae39..bb88357 100644 --- a/ICSDClient.py +++ b/ICSDClient.py @@ -1,66 +1,14 @@ import os -import re -import numpy as np -import datetime -import pandas as pd from contextlib import contextmanager from functools import partial from concurrent.futures import ThreadPoolExecutor, as_completed from csv import DictWriter import zipfile import io - +from time import sleep import requests from bs4 import BeautifulSoup -def main(): - def test(cli): - search_string = "numberofelements: 1 and composition: Fe" - ids = cli.search(search_string) - print(len(ids)) - # cli.data_to_csv(ids) - cli.cifs_to_zip(ids, 'test_search') - # cli.fetch_cifs(ids) - # success, data = next(gen) # unpack generator - # print(f'successful?: {success}') - # print(data) - - with ICSDHelper('AVV9002682', 'icsd590', verbose=True) as cli: - test(cli) - - # client = ICSDClient("YOUR_USERNAME", "YOUR_PASSWORD") - - # search_dict = {"collectioncode": "1-5000"} - - # search = client.advanced_search(search_dict, - # property_list=["CollectionCode", "StructuredFormula","CalculatedDensity","MeasuredDensity","CellVolume"]) - - # data=[] - - # for i,item in enumerate(search): - # data.append([int(item[0]),int(item[1][0]),item[1][1],item[1][2],item[1][3],item[1][4]]) - - - # pd_data=pd.DataFrame(data,columns=['DB_id','Col_code','name','cal_density', 'meas_density','cellvolume']) - - # pd_data.to_csv('densities.csv',index=True) - - - # # search_dict = {"collectioncode": "1-100"} - - # # search = client.advanced_search(search_dict) - # # cifs = client.fetch_cifs(search) - - # # x = client.search("Li O") - # # cifs = client.fetch_cifs(search) - - # # client.fetch_all_cifs() - - # # cif = client.fetch_cif(1) - # # client.writeout(cif) - - # client.logout() - class ICSDHelper: MAX_CIFS = 500 @@ -106,6 +54,9 @@ def search(self, search_string): ids = self.query_mgr.advanced_search(self.token, search_string) return ids + + def basic_search(self, query): + ids = self.query_mgr.search(self.token, query) def build_search_string(self, search_dict, search_type='or'): for k, v in search_dict.items(): @@ -141,7 +92,7 @@ def fetch_cif_batch(ids, batch_idx): result = future.result() yield True, result except Exception as e: - raise e + # raise e yield False, ids def cifs_to_zip(self, ids, output_folder='./output', output_file='icsd'): @@ -162,7 +113,6 @@ def copy_all(from_zip, to_zip): failed_ids = [] with zipfile.ZipFile(results_file, mode='w') as archive: - # with zipfile.ZipFile(f'{output_file}_results.zip', mode='w') as archive: for success, result in self.fetch_cifs(ids, zip=True, output_file=output_file): if success: with zipfile.ZipFile(io.BytesIO(result)) as zf1: @@ -275,6 +225,7 @@ def load_search_dict(self): return {k.lower(): v for k, v in search_dict.items()} class ICSDClient: + url = 'https://icsd.fiz-karlsruhe.de/ws/' STATUS_OK = 200 def __init__(self, verbose=False, windows_client=False, timeout=15): @@ -284,59 +235,39 @@ def __init__(self, verbose=False, windows_client=False, timeout=15): self.verbose = verbose def authorize(self, id, pwd, verbose=True): - data = {"loginid": id, - "password": pwd} + data = {"loginid": id, "password": pwd} + headers = {'accept': 'text/plain', 'Content-Type': 'application/x-www-form-urlencoded'} - headers = { - 'accept': 'text/plain', - 'Content-Type': 'application/x-www-form-urlencoded', - } - - response = requests.post('https://icsd.fiz-karlsruhe.de/ws/auth/login', - headers=headers, - data=data) - - self.session_history.append(response) + attempts = 1 + while attempts <= 5: + response = requests.post(self.url+'auth/login', headers=headers, data=data) + self.session_history.append(response) - if response.status_code == self.STATUS_OK: - token = response.headers['ICSD-Auth-Token'] - if verbose: print(f"Authentication succeeded. Your Auth Token for this session is {token} which will expire in one hour.") - return token + if response.status_code == self.STATUS_OK: + token = response.headers['ICSD-Auth-Token'] + if self.verbose: + print(f'Login successful. auth token: {token}.') + return token + else: # try again -- TODO should depend on reason for failure + sleep(0.1) + if self.verbose: + print(f'Login attempt {attempts} failed.') + attempts += 1 else: - if verbose: print(response.content) - + if self.verbose: + print('Login failed.') + def logout(self, auth_token, verbose=True): - headers = { - 'accept': 'text/plain', - 'ICSD-Auth-Token': auth_token, - } + headers = {'accept': 'text/plain', 'ICSD-Auth-Token': auth_token,} - response = requests.get('https://icsd.fiz-karlsruhe.de/ws/auth/logout', headers=headers) - if verbose: print(response.content) + response = requests.get(self.url+'auth/logout', headers=headers) + if self.verbose: + print(f'Logout using token {auth_token}. Status: {response.status_code}: {response.content.decode("UTF-8")}.') self.session_history.append(response) return response - def writeout(self, cifs, folder="./cifs/"): - if not os.path.exists(folder): - os.makedirs(folder) - - if not isinstance(cifs, list): - if cifs is None: - print("Requires a valid cif string, this string is None. Ensure download was successful") - return - - cifs = [cifs] - - for cif in cifs: - icsd_code = re.search(r"_database_code_ICSD ([0-9]+)", cif).group(1) - filename = f"icsd_{int(icsd_code):06}.cif" - - with open(os.path.join(folder, filename), "w") as f: - for line in cif.splitlines(): - f.write(line + "\n") - def search(self, auth_token, searchTerm, content_type=None): ''' Available content EXPERIMENTAL_INORGANIC, EXPERIMENTAL_METALORGANIC, THERORETICAL_STRUCTURES @@ -362,7 +293,7 @@ def search(self, auth_token, searchTerm, content_type=None): 'ICSD-Auth-Token': auth_token, } - response = requests.get('https://icsd.fiz-karlsruhe.de/ws/search/simple', + response = requests.get(self.url+'search/simple', headers=headers, params=params, timeout=self.timeout) @@ -376,29 +307,35 @@ def search(self, auth_token, searchTerm, content_type=None): return list(zip(search_results, compositions)) def advanced_search(self, auth_token, search_string): - params = ( - ('query', search_string), - ('content type', "EXPERIMENTAL_INORGANIC"), - ) - - headers = { - 'accept': 'application/xml', - 'ICSD-Auth-Token': auth_token, - } + def format_response(response): + return_data = BeautifulSoup(response.content, features="xml") + try: ret = return_data.idnums.contents[0].split(" ") + except IndexError: ret = return_data.idnums.contents + if self.verbose: + print(f'Search returned {len(ret)} values.') + return ret + + if self.verbose: + print(f'Performing search {search_string}.') + + params = (('query', search_string),('content type', "EXPERIMENTAL_INORGANIC")) + headers = {'accept': 'application/xml', 'ICSD-Auth-Token': auth_token} - response = requests.get('https://icsd.fiz-karlsruhe.de/ws/search/expert', + response = requests.get(self.url+'search/expert', headers=headers, params=params, timeout=self.timeout) - # TODO add exception handling for timeouts - self.session_history.append({search_string: response}) - soup = BeautifulSoup(response.content, features="xml") - search_results = soup.idnums.contents[0].split(" ") - return search_results - + # TODO add exception handling for timeouts + if response.status_code == self.STATUS_OK: + return format_response(response) + else: + if response.status_code == self.STATUS_NOAUTH: + raise ConnectionError('Authenication token {auth_token} refused.') + if self.verbose: + print(f'Search failed. Status code {response.status_code}') def fetch_data(self, auth_token, ids, batch_idx=1, property_list = None): """ @@ -422,18 +359,15 @@ def format_response(response): if self.verbose: print(f'Fetching data for {len(ids)} items (batch {batch_idx}).') - headers = { - 'accept': 'application/csv', - 'ICSD-Auth-Token': auth_token, - } + headers = {'accept': 'application/csv', 'ICSD-Auth-Token': auth_token} + if property_list is None: property_list = [] params = [ ('idnum', ids), - ('windowsclient', False), + ('windowsclient', self.windows_client), ('listSelection', ['CollectionCode', 'SumFormula', 'StructuredFormula'] + property_list)] - response = requests.get('https://icsd.fiz-karlsruhe.de/ws/csv', headers=headers, params=params) - + response = requests.get(self.url+'csv', headers=headers, params=params) return format_response(response) @@ -452,7 +386,7 @@ def fetch_cif(self, auth_token, id): ('windowsclient', self.windows_client), ) - response = requests.get(f'https://icsd.fiz-karlsruhe.de/ws/cif/{id}', headers=headers, params=params) + response = requests.get(f'{self.url}{id}', headers=headers, params=params) self.session_history.append({id: response}) @@ -469,41 +403,49 @@ def fetch_cifs(self, auth_token, ids, batch_idx = 1, zip = False, output_file='i if self.verbose: print(f'Fetching {len(ids)} cifs (batch {batch_idx}).') - headers = { - 'accept': 'application/cif', - 'ICSD-Auth-Token': auth_token, - } + headers = {'accept': 'application/cif', 'ICSD-Auth-Token': auth_token} params = [ ('idnum', ids), ('celltype', 'experimental'), ('windowsclient', self.windows_client), ] + if zip: params.append(('filename', output_file)) params.append(('filetype', 'zip')) - response = requests.get('https://icsd.fiz-karlsruhe.de/ws/cif/multiple', headers=headers, params=params) + response = requests.get(self.url+'cif/multiple', headers=headers, params=params) if response.status_code == self.STATUS_OK: return response.content else: raise Exception('Failed to get cifs.') else: params.append(('filetype', 'cif')) - response = requests.get('https://icsd.fiz-karlsruhe.de/ws/cif/multiple', headers=headers, params=params) + response = requests.get(self.url+'cif/multiple', headers=headers, params=params) if response.status_code == self.STATUS_OK: cifs = response.content.decode("UTF-8").split('#(C)')[1:] return ['#(C)'+cif for cif in cifs] else: raise Exception('Failed to get cifs.') -def fetch_all_cifs(self, auth_token, cif_path="./cifs/"): - max_coll_code = 1_000_000 - with ICSDHelper() as cli: - search_string = f"collectioncode=0-{max_coll_code}" + +def main(): + def fetch_all_cifs(self, auth_token, cif_path="./cifs/"): + max_coll_code = 1_000_000 + with ICSDHelper() as cli: + search_string = f"collectioncode=0-{max_coll_code}" + ids = cli.search(search_string) + cli.cifs_to_zip(ids, 'test_search') + + def test(cli): + search_string = "numberofelements: 1 and composition: Fe" ids = cli.search(search_string) - for success, cifs in cli.fetch_cifs(ids): - self.writeout(cifs, cif_path) + print(len(ids)) + cli.data_to_csv(ids) + cli.cifs_to_zip(ids, 'test_search') + with ICSDHelper("YOUR USERNAME", "YOUR PASSWORD", verbose=True) as cli: + test(cli) if __name__ == "__main__": main() From b2e0dd690e0ceac1b20e33e55ffa4c4cd4089900 Mon Sep 17 00:00:00 2001 From: Jude Date: Wed, 1 Feb 2023 01:19:51 +0000 Subject: [PATCH 7/8] tidying up --- ICSDClient.py | 61 +++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 49 insertions(+), 12 deletions(-) diff --git a/ICSDClient.py b/ICSDClient.py index bb88357..e5d2eef 100644 --- a/ICSDClient.py +++ b/ICSDClient.py @@ -429,21 +429,58 @@ def fetch_cifs(self, auth_token, ids, batch_idx = 1, zip = False, output_file='i raise Exception('Failed to get cifs.') -def main(): - def fetch_all_cifs(self, auth_token, cif_path="./cifs/"): - max_coll_code = 1_000_000 - with ICSDHelper() as cli: - search_string = f"collectioncode=0-{max_coll_code}" - ids = cli.search(search_string) - cli.cifs_to_zip(ids, 'test_search') - - def test(cli): - search_string = "numberofelements: 1 and composition: Fe" + +# examples +def test(cli: ICSDHelper): + search_string = "numberofelements: 1 and composition: Fe" + ids = cli.search(search_string) + print(len(ids)) + cli.data_to_csv(ids) + cli.cifs_to_zip(ids, 'test_search') + +def fetch_all_cifs(): + max_coll_code = 1_000_000 + search_string = f"collectioncode=0-{max_coll_code}" + with ICSDHelper() as cli: ids = cli.search(search_string) - print(len(ids)) - cli.data_to_csv(ids) cli.cifs_to_zip(ids, 'test_search') +def intermetallics(cli: ICSDHelper): + non_metals = {'H', 'D', 'T', 'He', + 'B', 'C', 'N', 'O', 'F', 'Ne', + 'Si', 'P', 'S', 'Cl', 'Ar', + # 'Ge', + 'As', 'Se', 'Br', 'Kr', + # 'Sb', + 'Te', 'I', 'Xe', + # 'Po', + 'At', 'Rn', + 'Ts', 'Og'} + + include_nm = ' or '.join([f'composition: {el}' for el in non_metals]) + exclude_nm = 'not (' + include_nm + ')' + search_string = 'numberofelements: >=2 ' + exclude_nm + + ids = cli.search(search_string) + cli.data_to_csv( + ids, + 'intermetallics_data', + columns = ['StructuredFormula', 'ChemicalName']) + cli.cifs_to_zip(ids, 'intermetallics_search') + +def minerals(cli: ICSDHelper): + search_string = "mineralname: *" + search_string = 'numberofelements: >=2 and ' + search_string + + ids = cli.search(search_string) + cli.data_to_csv( + ids, + 'minerals_data2', + columns = ['StructuredFormula', 'ChemicalName', 'MineralName', 'MineralGroup']) + cli.cifs_to_zip(ids, 'minerals_search2') + + +def main(): with ICSDHelper("YOUR USERNAME", "YOUR PASSWORD", verbose=True) as cli: test(cli) From 32a561f665b2eb95e9ed35af673913287474865d Mon Sep 17 00:00:00 2001 From: Jude Date: Thu, 2 Feb 2023 13:38:46 +0000 Subject: [PATCH 8/8] improve exception handling --- ICSDClient.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/ICSDClient.py b/ICSDClient.py index e5d2eef..f5cb672 100644 --- a/ICSDClient.py +++ b/ICSDClient.py @@ -20,20 +20,23 @@ def __init__(self, id, pwd, verbose=False): self.verbose = verbose self.search_dict = self.load_search_dict() - def connect(self): + def connect(self): self.token = self.query_mgr.authorize(self.id, self.pwd) def close_connection(self): - self.query_mgr.logout(self.token) + if self.token: + self.query_mgr.logout(self.token) self.token = None @contextmanager def temp_connection(self): + token = None try: token = self.query_mgr.authorize(self.id, self.pwd) yield token finally: - self.query_mgr.logout(token) + if token: + self.query_mgr.logout(token) def __enter__(self): self.connect() @@ -46,7 +49,7 @@ def search(self, search_string): if self.token: try: ids = self.query_mgr.advanced_search(self.token, search_string) - except ConnectionError as e: + except ConnectionRefusedError as e: self.connect() # second attempt since stored token was rejected. ids = self.query_mgr.advanced_search(self.token, search_string) else: @@ -227,6 +230,7 @@ def load_search_dict(self): class ICSDClient: url = 'https://icsd.fiz-karlsruhe.de/ws/' STATUS_OK = 200 + STATUS_NOAUTH = 401 def __init__(self, verbose=False, windows_client=False, timeout=15): self.session_history = [] @@ -254,15 +258,14 @@ def authorize(self, id, pwd, verbose=True): print(f'Login attempt {attempts} failed.') attempts += 1 else: - if self.verbose: - print('Login failed.') + raise ConnectionRefusedError(f'Unable to log in with id {id} and password {pwd}.') def logout(self, auth_token, verbose=True): headers = {'accept': 'text/plain', 'ICSD-Auth-Token': auth_token,} response = requests.get(self.url+'auth/logout', headers=headers) if self.verbose: - print(f'Logout using token {auth_token}. Status: {response.status_code}: {response.content.decode("UTF-8")}.') + print(f'Logout using token {auth_token}. Status: {response.status_code}, {response.content.decode("UTF-8")}.') self.session_history.append(response) @@ -333,7 +336,7 @@ def format_response(response): return format_response(response) else: if response.status_code == self.STATUS_NOAUTH: - raise ConnectionError('Authenication token {auth_token} refused.') + raise ConnectionRefusedError('Authenication token {auth_token} refused.') if self.verbose: print(f'Search failed. Status code {response.status_code}') @@ -434,7 +437,6 @@ def fetch_cifs(self, auth_token, ids, batch_idx = 1, zip = False, output_file='i def test(cli: ICSDHelper): search_string = "numberofelements: 1 and composition: Fe" ids = cli.search(search_string) - print(len(ids)) cli.data_to_csv(ids) cli.cifs_to_zip(ids, 'test_search')