import pandas as pd

from geofetch.cli import _parse_cmdl
- from geofetch.const import *
+ from geofetch.const import (
+     GSE_PATTERN,
+     SAMPLE_SUPP_METADATA_FILE,
+     EXP_SUPP_METADATA_FILE,
+     NEW_GENOME_COL_NAME,
+     FILE_RAW_NAME_SAMPLE_PATTERN,
+     FILE_RAW_NAME_SUBSAMPLE_PATTERN,
+     CONFIG_RAW_TEMPLATE_NAME,
+     CONFIG_SRA_TEMPLATE,
+     CONFIG_PROCESSED_TEMPLATE_NAME,
+     NUM_RETRIES,
+     SER_SUPP_FILE_PATTERN,
+     SUPP_FILE_PATTERN,
+     PROJECT_PATTERN,
+     NCBI_EFETCH,
+     NCBI_ESEARCH,
+     EXPERIMENT_PATTERN,
+ )
from geofetch.utils import (
    Accession,
    build_prefetch_command,
@@ -480,8 +497,8 @@ def fetch_all(self, input: str, name: str = None) -> Union[NoReturn, peppy.Proje
                    file_gse_content, gsm_metadata, file_sra
                )
                if not srp_list_result:
-                    _LOGGER.info(f"No SRP data, continuing ....")
-                    _LOGGER.warning(f"No raw pep will be created! ....")
+                    _LOGGER.info("No SRP data, continuing ....")
+                    _LOGGER.warning("No raw pep will be created! ....")
                    # delete current acc if no raw data was found
                    # del metadata_dict[acc_GSE]
                    pass
@@ -498,7 +515,7 @@ def fetch_all(self, input: str, name: str = None) -> Union[NoReturn, peppy.Proje
                        _LOGGER.info(f"Getting SRR: {run} in ({acc_GSE})")
                        self._download_raw_data(run)
                else:
-                    _LOGGER.info(f"Dry run, no data will be downloaded")
+                    _LOGGER.info("Dry run, no data will be downloaded")

            # save one project
            if self.acc_anno and nkeys > 1:
@@ -517,7 +534,7 @@ def fetch_all(self, input: str, name: str = None) -> Union[NoReturn, peppy.Proje

        # Logging cleaning process:
        if self.discard_soft:
-            _LOGGER.info(f"Cleaning soft files ...")
+            _LOGGER.info("Cleaning soft files ...")
            clean_soft_files(self.metadata_root_full)

    #######################################################################################
@@ -878,7 +895,7 @@ def _expand_metadata_list_item(self, metadata_list: list, dict_key: str):
        if element_is_list:
            for n_elem in range(len(metadata_list)):
                try:
-                    if type(metadata_list[n_elem][dict_key]) is not list:
+                    if not isinstance(metadata_list[n_elem][dict_key], list):
                        metadata_list[n_elem][dict_key] = [
                            metadata_list[n_elem][dict_key]
                        ]
@@ -930,7 +947,7 @@ def _expand_metadata_list_item(self, metadata_list: list, dict_key: str):
                            metadata_list[n_elem][dict_key] = this_string
                        else:
                            del metadata_list[n_elem][dict_key]
-                except KeyError as err:
+                except KeyError:
                    # _LOGGER.warning(
                    #     f"expand_metadata_list: Key Error: {err}, continuing ..."
                    # )
@@ -980,6 +997,7 @@ def _write_processed_annotation(
    ) -> Union[NoReturn, peppy.Project]:
        """
        Save annotation file by providing list of dictionaries with files metadata
+
        :param list processed_metadata: list of dictionaries with files metadata
        :param str file_annotation_path: the path to the metadata file that has to be saved
        :param just_object: True, if you want to get peppy object without saving file
@@ -1046,13 +1064,14 @@ def _write_processed_annotation(
            proj = peppy.Project().from_pandas(pd_value, config=conf)
            proj_exp_data = conf.get("experiment_metadata")
            if proj_exp_data:
-                proj["description"] = proj_exp_data.get("series_title")
+                proj.description = proj_exp_data.get("series_title")
            return proj

    @staticmethod
    def _find_genome(metadata_list: list) -> list:
        """
        Create new genome column by searching joining few columns
+
        :param metadata_list: list with metadata dict
        :return: list with metadata dict where genome column was added
        """
@@ -1080,6 +1099,7 @@ def _write_raw_annotation_new(
        """
        Combine individual accessions into project-level annotations, and writing
        individual accession files (if requested)
+
        :param name: Name of the run, project, or acc --> will influence name of the folder where project will be created
        :param metadata_dict: dictionary of sample annotations
        :param subannot_dict: dictionary of subsample annotations
@@ -1128,7 +1148,7 @@ def _write_raw_annotation_new(
                f"subsample_table: {os.path.basename(proj_root_subsample)}"
            )
        else:
-            subanot_path_yaml = f""
+            subanot_path_yaml = ""

        template = self._create_config_raw(
            proj_meta, proj_root_sample, subanot_path_yaml, gse_meta_dict
@@ -1166,7 +1186,7 @@ def _write_raw_annotation_new(
            proj = peppy.Project().from_pandas(meta_df, sub_meta_df, conf)
            proj_exp_data = conf.get("experiment_metadata")
            if proj_exp_data:
-                proj["description"] = proj_exp_data.get("series_title")
+                proj.description = proj_exp_data.get("series_title")
            return proj

    def _create_config_processed(
@@ -1177,6 +1197,7 @@ def _create_config_processed(
    ) -> str:
        """
        Compose and generate config file content
+
        :param file_annotation_path: root to the annotation file
        :param proj_meta: common metadata that has to added to config file
        :param meta_in_series:
@@ -1218,6 +1239,7 @@ def _create_config_raw(
    ):
        """
        Compose and generate config file content for raw data
+
        :param proj_meta: root to the annotation file
        :param proj_root_sample: path to sampletable file
        :param subanot_path_yaml: path to subannotation file
@@ -1275,6 +1297,7 @@ def _check_sample_name_standard(metadata_dict: dict) -> dict:
        """
        Standardize sample name and checking if it exists
        (This function is used for raw data)
+
        :param metadata_dict: metadata dict
        :return: metadata dict with standardize sample names
        """
@@ -1300,14 +1323,16 @@ def _separate_common_meta(
    ) -> tuple:
        """
        Separate experiment(project) metadata from sample metadata
+
        :param list or dict meta_list: list of dictionaries of samples
        :param int max_len: threshold of the length of the common value that can be stored in the sample table
        :param int del_limit: threshold of the length of the common value that have to be deleted
        :param int attr_limit_truncate: max length of the attribute in the sample csv
        :return set: Return is a set of list, where 1 list (or dict) is
-            list of samples metadata dictionaries and 2: list of common samples metadata
-            dictionaries that are linked to the project.
+            list of samples metadata dictionaries and 2: list of common samples metadata
+            dictionaries that are linked to the project.
        """
+
        # check if meta_list is dict and converting it to list
        input_is_dict = False
        if isinstance(meta_list, dict):
@@ -1401,6 +1426,7 @@ def _download_SRA_file(self, run_name: str):
    def _sra_to_bam_conversion_sam_dump(self, bam_file: str, run_name: str) -> NoReturn:
        """
        Convert SRA file to BAM file by using samtools function "sam-dump"
+
        :param str bam_file: path to BAM file that has to be created
        :param str run_name: SRR number of the SRA file that has to be converted
        """
@@ -1509,7 +1535,7 @@ def _download_file(
        full_filepath = os.path.join(data_folder, new_name)

        if not os.path.exists(full_filepath):
-            _LOGGER.info(f"\033[38;5;242m")  # set color to gray
+            _LOGGER.info("\033[38;5;242m")  # set color to gray
            # if dir does not exist:
            if not os.path.exists(data_folder):
                os.makedirs(data_folder)
@@ -1518,7 +1544,7 @@ def _download_file(
            )
            _LOGGER.info(f"\033[38;5;242m{ret}\033[0m")
            time.sleep(sleep_after)
-            _LOGGER.info(f"\033[0m")  # Reset to default terminal color
+            _LOGGER.info("\033[0m")  # Reset to default terminal color
        else:
            _LOGGER.info(f"\033[38;5;242mFile {full_filepath} exists.\033[0m")

@@ -1545,7 +1571,7 @@ def _get_list_of_processed_files(
                    pl = parse_SOFT_line(line)
                    file_url = pl[list(pl.keys())[0]].rstrip()
                    filename = os.path.basename(file_url)
-                    _LOGGER.debug(f"Processed GSE file found: %s" % str(file_url))
+                    _LOGGER.debug(f"Processed GSE file found: {str(file_url)}")

                    # search for tar file:
                    if tar_re.search(filename):
@@ -1574,7 +1600,7 @@ def _get_list_of_processed_files(
                    )

                else:
-                    raise Exception(f"error in requesting tar_files_list")
+                    raise Exception("error in requesting tar_files_list")
        else:
            _LOGGER.info(f"Found previous GSM file: {filelist_path}")
            filelist_obj = open(filelist_path, "r")
@@ -1610,9 +1636,8 @@ def _get_list_of_processed_files(
                        ):
                            meta_processed_samples[nb].update(pl)
                        else:
-                            if (
-                                type(meta_processed_samples[nb][element_keys])
-                                is not list
+                            if not isinstance(
+                                meta_processed_samples[nb][element_keys], list
                            ):
                                meta_processed_samples[nb][element_keys] = [
                                    meta_processed_samples[nb][element_keys]
@@ -1631,7 +1656,7 @@ def _get_list_of_processed_files(
                        pl = parse_SOFT_line(line_gsm)
                        file_url_gsm = pl[list(pl.keys())[0]].rstrip()
                        _LOGGER.debug(
-                            f"Processed GSM file found: %s" % str(file_url_gsm)
+                            f"Processed GSM file found: {str(file_url_gsm)}"
                        )
                        if file_url_gsm != "NONE":
                            meta_processed_samples[nb]["files"].append(file_url_gsm)
@@ -1643,8 +1668,7 @@ def _get_list_of_processed_files(
        meta_processed_samples = _separate_file_url(meta_processed_samples)

        _LOGGER.info(
-            f"\nTotal number of processed SAMPLES files found is: "
-            f"%s" % str(len(meta_processed_samples))
+            f"\nTotal number of processed SAMPLES files found is: {str(len(meta_processed_samples))}"
        )

        # expand meta_processed_samples with information about type and size
@@ -1677,21 +1701,21 @@ def _get_list_of_processed_files(
                    if bl_key not in meta_processed_series.keys():
                        meta_processed_series.update(bl)
                    else:
-                        if type(meta_processed_series[bl_key]) is not list:
+                        if not isinstance(meta_processed_series[bl_key], list):
                            meta_processed_series[bl_key] = [meta_processed_series[bl_key]]
                            meta_processed_series[bl_key].append(bl_value)
                        else:
                            meta_processed_series[bl_key].append(bl_value)
                except IndexError as ind_err:
                    _LOGGER.debug(
-                        f"IndexError in adding value to meta_processed_series: %s" % ind_err
+                        f"IndexError in adding value to meta_processed_series: {ind_err}"
                    )

        meta_processed_series = _separate_list_of_files(meta_processed_series)
        meta_processed_series = _separate_file_url(meta_processed_series)
        _LOGGER.info(
            f"Total number of processed SERIES files found is: "
-            f"%s" % str(len(meta_processed_series))
+            f"{str(len(meta_processed_series))}"
        )
        if self.filter_re:
            meta_processed_series = self._run_filter(meta_processed_series)
@@ -1778,6 +1802,7 @@ def _download_processed_file(self, file_url: str, data_folder: str) -> bool:
    def _get_SRA_meta(self, file_gse_content: list, gsm_metadata, file_sra=None):
        """
        Parse out the SRA project identifier from the GSE file
+
        :param list file_gse_content: list of content of file_sde_content
        :param dict gsm_metadata: dict of GSM metadata
        :param str file_sra: full path to SRA.csv metafile that has to be downloaded
@@ -1805,7 +1830,7 @@ def _get_SRA_meta(self, file_gse_content: list, gsm_metadata, file_sra=None):
                acc_SRP = list(gsm_metadata.keys())[0]
                _LOGGER.warning(
                    "But the GSM has an SRX number; instead of an "
-                    "SRP, using SRX identifier for this sample: " + acc_SRP
+                    f"SRP, using SRX identifier for this sample: {acc_SRP}"
                )
            except TypeError:
                _LOGGER.warning("Error in gsm_metadata")
@@ -1839,7 +1864,7 @@ def _get_SRA_meta(self, file_gse_content: list, gsm_metadata, file_sra=None):
                return []
        else:
            # open existing annotation
-            _LOGGER.info(f"Found SRA metadata, opening..")
+            _LOGGER.info("Found SRA metadata, opening..")
            with open(file_sra, "r") as m_file:
                reader = csv.reader(m_file)
                file_list = []
@@ -1869,7 +1894,7 @@ def _get_SRP_list(self, srp_number: str) -> list:
        :return: list of dicts of SRRs
        """
        if not srp_number:
-            _LOGGER.info(f"No srp number in this accession found")
+            _LOGGER.info("No srp number in this accession found")
            return []
        _LOGGER.info(f"Downloading {srp_number} sra metadata")
        ncbi_esearch = NCBI_ESEARCH.format(SRP_NUMBER=srp_number)