diff --git a/orangecontrib/text/pubmed.py b/orangecontrib/text/pubmed.py index adb667f46..1a2956808 100644 --- a/orangecontrib/text/pubmed.py +++ b/orangecontrib/text/pubmed.py @@ -4,12 +4,10 @@ from datetime import datetime import numpy as np -from Bio import Entrez -from Bio import Medline +from Bio import Entrez, Medline +from Orange.data import DiscreteVariable, Domain, StringVariable, TimeVariable from Orange.misc import environ - -from Orange.data import StringVariable, DiscreteVariable, TimeVariable, Domain from orangecontrib.text.corpus import Corpus BASE_ENTRY_URL = 'http://www.ncbi.nlm.nih.gov/pubmed/?term=' @@ -185,7 +183,6 @@ def __init__(self, email, progress_callback=None, error_callback=None): self.error_callback = error_callback self.stop_signal = False - self.cache_path = None cache_folder = os.path.join(environ.cache_dir(), 'pubmedcache') if not os.path.exists(cache_folder): @@ -323,7 +320,6 @@ def _retrieve_records(self, num_records, `orangecontrib.text.corpus.Corpus`: The retrieved PubMed records as a corpus. """ - corpus = None batch_size = min(self.MAX_BATCH_SIZE, num_records) cached_data = [] # Later on, construct the corpus from this. new_records = [] # Must download. @@ -353,10 +349,8 @@ def _retrieve_records(self, num_records, # Advance the callback accordingly. self.progress_callback(int(cached_data_size/batch_size)) - # Create a starting corpus. - corpus = _corpus_from_records(cached_data, includes_metadata) - # --- Retrieve missing/new --- + records = [] if len(new_records) > 0: try: post_handle = Entrez.epost('pubmed', id=','.join(new_records)) @@ -403,18 +397,8 @@ def _retrieve_records(self, num_records, if self.progress_callback: self.progress_callback() - if corpus is None: - corpus = _corpus_from_records(records, includes_metadata) - else: # Update the corpus. - time_var = corpus.domain[PUBMED_FIELD_DATE] - meta_values, class_values = _records_to_corpus_entries( - records, - includes_metadata=includes_metadata, - time_var=time_var, - ) - - corpus.extend_corpus(meta_values, class_values) - + data = cached_data + records + corpus = _corpus_from_records(data, includes_metadata) if len(data) else None return corpus def download_records(self, terms=[], authors=[], diff --git a/orangecontrib/text/tests/data/pubmed-cache.txt b/orangecontrib/text/tests/data/pubmed-cache.txt new file mode 100644 index 000000000..cece68608 --- /dev/null +++ b/orangecontrib/text/tests/data/pubmed-cache.txt @@ -0,0 +1,4 @@ +{ +"read": {"RetMax": "533", "TranslationStack": [{"Term": "orchid[All fields]", "Field": "All fields", "Explode": "N", "Count": "1410"}, {"Term": "2011/11/02[PDAT]", "Field": "PDAT", "Explode": "N", "Count": "0"}, {"Term": "2015/11/02[PDAT]", "Field": "PDAT", "Explode": "N", "Count": "0"}, "RANGE", "AND"], "IdList": ["26779201", "26713612", "26666122", "26603277", "26602351"], "RetStart": "0", "WebEnv": "NCID_1_13312974_165.112.9.37_9001_1459677505_1646131365_0MetA0_S_MegaStore_F_1", "TranslationSet": [], "QueryTranslation": "orchid[All fields] AND 2011/11/02[PDAT] : 2015/11/02[PDAT]", "Count": "533", "QueryKey": "1"}, +"parse": [{"PMID": "26779201", "FAU": ["Li, Jia-Wei", "Zhang, Shi-Bao"], "TI": "Differences in the Responses of Photosystems I and II in Cymbidium sinense and C. tracyanum to Long-Term Chilling Stress.", "MH": null, "AB": "The susceptibility of photosystem I (PSI) and photosystem II (PSII) to chilling stress depends on plant species, and cyclic electron flow (CEF) plays an important role in photoprotection for some species under short stress periods. However, little is known about the responses of PSI and PSII to long-term chilling stress. We studied two orchid species-Cymbidium sinense and C. tracyanum- that differ in their capacity to adapt to low temperature, and exposed plants for 19 d to stress conditions that included 4 degrees C and a light intensity of 250 to 350 mumol photons m(-2) s(-1). Meanwhile, we investigated their dynamic variations in Chl fluorescence and P700 parameters. After exposure to 4 degrees C and 250 mumol photons m(-2) s(-1) for 6 h, PSI activity was maintained stable in both species, but stronger PSII photoinhibition was observed in C. sinense. During the long-term treatment, the maximum quantum yield of PSII was significantly reduced, with that decrease being greater in C. sinense. After 19 d of chilling treatment, the maximum photo-oxidizable P700 declined only slightly in C. tracyanum but dropped significantly in C. sinense. Linear electron flow was largely depressed during the long-term chilling treatment, especially in C. sinense. Meanwhile, C. tracyanum showed higher CEF activity than C. sinense. These results indicate that PSII is more sensitive to chilling-light stress than PSI in both species. The rate of PSII photodamage at chilling-light stress is higher in C. sinense than C. tracyanum, and CEF contributes to photoprotection for PSI and PSII under long-term chilling stress in C. tracyanum.", "DP": "2015"}, {"PMID": "26666122", "FAU": ["Dos Santos, Aline Borba", "do Nascimento, Fabio Santos"], "TI": "Cuticular Hydrocarbons of Orchid Bees Males: Interspecific and Chemotaxonomy Variation.", "MH": null, "AB": "Recent studies have investigated the composition of compounds that cover the cuticle in social insects, but few studies have focused on solitary bees. Cuticular hydrocarbons may provide a tool for chemotaxonomy, and perhaps they can be used as a complement to morphology and genetic characters in phylogenetic studies. Orchid bees (Tribe Euglossini) are a highly diverse group of Neotropical bees with more than 200 species. Here, the cuticular hydrocarbons of 17 species were identified and statistical analysis revealed 108 compounds, which allowed for the taxonomic classification according to the genera. The most significant compounds discriminating the four genera were (Z)-9-pentacosene, (Z,Z)-pentatriacontene-3, (Z)-9-tricosene, and (Z)-9-heptacosene. The analyses demonstrated the potential use of CHCs to identify different species.", "DP": "2015"}, {"PMID": "26713612", "FAU": ["Ospina-Torres, Rodulfo", "Montoya-Pfeiffer, Paula Maria", "Parra-H, Alejandro", "Solarte, Victor", "Tupac Otero, Joel"], "TI": "Interaction networks and the use of floral resources by male orchid bees (Hymenoptera: Apidae: Euglossini) in a primary rain forests of the Choco Region (Colombia).", "MH": ["Animals", "Bees/classification/*physiology", "Colombia", "*Ecosystem", "Male", "Orchidaceae/*classification", "*Pollination", "Population Density", "Rainforest"], "AB": "Orchid bees are important keystone pollinators from the Neotropics. With the aim to study the relationships between orchid bees and their nectar and aromatic host species, we made systematic samplings of males across two conservation areas in the biogeographic Choc6 Region of Colombia. We used chemical baits to collect 352 male bees during five months. The pollen attached to their bodies was extracted for palynological identification and to estimate interaction networks. The euglossine community consisted of at least 22 species including Eg. maculilabris, Eg. orellana, Eg. championi and Eg. ignita. The male bees were associated with 84 plants but depended on a small group of them (Peperomia spp. and Anthurium spp, as well as species of Solanaceae, Ericaceae and Malpighiaceae) which were widely distributed across the altitudinal gradient, and were available through the year. The resulting interaction networks revealed a typical nested pattern usually found in plant-pollinator interactions, with several rare bee and plant species interaction with a small group of generalist bees and plant species. Albeit, we found variation within networks related to species composition. Such variation may be a consequence of specific differences in plant flowering phenology.", "DP": "2015 Sep"}, {"PMID": "26603277", "FAU": ["Nielsen, Lasse Janniche", "Moller, Birger Lindberg"], "TI": "Scent emission profiles from Darwin's orchid--Angraecum sesquipedale: Investigation of the aldoxime metabolism using clustering analysis.", "MH": null, "AB": "The display of scent is crucial for plants in attracting pollinating insects to flowers and ensuring successful pollination and reproduction. The large number of aldoxime volatile species present in the scent of the Madagascan orchid Angraecum sesquipedale has been suggested to play a primary role in attracting the sphingid moth Xanthopan morgani praedicta. By solid phase micro-extraction (SPME) coupled with gas chromatography-mass spectrometry (GC-MS), we monitored the scent release from different flowers of a single orchid, day and night throughout the entire flowering period. In separate experiments, the diurnal release was monitored in 3h intervals and the tissue specific release from the different floral parts was tracked. Numerous novel compounds related to the aldoxime metabolism not previously detected in A. sesquipedale were identified and positioned into a proposed pathway for aldoxime metabolism. From the results, we hypothesize that (E/Z)-phenylacetaldoxime and its derivatives could be important attractants for the pollinating moth X. morgani praedicta. By applying an untargeted Partitioning Around Medoids (PAM) cluster analysis to the metabolite profiles in the scent, the proposed pathways for the formation of aldoximes were substantiated. With this study, we demonstrate the powerful utility of a bioinformatics tool to aid in the elucidation of the routes of formation for volatiles and provide a benchmark and guidelines for future detailed observations of hawkmoth pollination of Angraecum species, and in particular A. sesquipedale, in the wild.", "DP": "2015 Dec"}, {"PMID": "26602351", "FAU": ["Oliveira, R", "Pinto, C E", "Schlindwein, C"], "TI": "Two common species dominate the species-rich Euglossine bee fauna of an Atlantic Rainforest remnant in Pernambuco, Brazil.", "MH": null, "AB": "Nowadays, the northern part of the Atlantic Rainforest of Brazil is largely destroyed and forest remnants rarely exceed 100 ha. In a 118 ha forest fragment within a state nature reserve of Pernambuco (Reserva Ecologica Gurjau), we surveyed the orchid bee fauna (Apidae, Euglossini) using eight different scent baits to attract males. Once a month during one year, the bees were actively collected with entomological nets, from November 2002 to October 2003 by two collectors. We collected 2,908 orchid bee males belonging to 23 species, one of the highest richness values of the Northern Atlantic Rainforest. Bees of only two species, Euglossa carolina (50%) and Eulaema nigrita (25%), which occurred throughout the year, accounted for three quarter of the collected individuals. Both species are typical for open or disturbed areas. Rainforest remnants like those of Gurjau within the predominant sugar cane monocultures in the coastal plains of the northern Atlantic Rainforest play an important role in orchid bee conservation and maintenance of biodiversity.", "DP": "2015 Nov"}] +} diff --git a/orangecontrib/text/tests/pubmed-cache.txt b/orangecontrib/text/tests/pubmed-cache.txt deleted file mode 100644 index 4871c77f2..000000000 --- a/orangecontrib/text/tests/pubmed-cache.txt +++ /dev/null @@ -1,4 +0,0 @@ -{ -"read": {"RetMax": "533", "TranslationStack": [{"Term": "orchid[All fields]", "Field": "All fields", "Explode": "N", "Count": "1410"}, {"Term": "2011/11/02[PDAT]", "Field": "PDAT", "Explode": "N", "Count": "0"}, {"Term": "2015/11/02[PDAT]", "Field": "PDAT", "Explode": "N", "Count": "0"}, "RANGE", "AND"], "IdList": ["26779201", "26713612", "26666122", "26603277", "26602351", "26580566", "26574675", "26571020", "26567856", "26567854", "26567851", "26558895", "26555336", "26523377", "26513609", "26505352", "26493518", "26493226", "26481137", "26481007", "26471909", "26452559", "26434147", "26423960", "26416561", "26387873", "26380706", "26372504", "26351150", "26335564", "26314297", "26314032", "26311710", "26311671", "26303983", "26299131", "26286221", "26271118", "26261398", "26260631", "26251157", "26249081", "26244769", "26236906", "26236866", "26224027", "26209365", "26206372", "26205509", "26183369", "26179361", "26162896", "26149997", "26149746", "26140205", "26134675", "26131375", "26125940", "26113634", "26105186", "26105185", "26082878", "26081278", "26079670", "26069956", "26063938", "26054613", "26046143", "26029428", "26025156", "26024358", "26006185", "25987618", "25970572", "25963669", "25959442", "25953040", "25947720", "25943771", "25941020", "25929591", "25917508", "25916981", "25902264", "25902058", "25900746", "25893148", "25886817", "25884020", "25865497", "25861687", "25861675", "25847454", "25844242", "25827410", "25825286", "25821245", "25814059", "25810660", "25801274", "25789487", "25784472", "25771863", "25771507", "25761566", "25756994", "25725112", "25711871", "25706625", "25704464", "25691974", "25678071", "25670973", "25652914", "25652831", "25639293", "25627384", "25627369", "25627364", "25619237", "25614926", "25612936", "25606433", "25600727", "25600397", "25590685", "25587149", "25582733", "25578271", "25567572", "25547596", "25546739", "25546318", "25538109", "25535483", "25526190", "25522604", "25505907", "25501842", "25491556", "25483791", "25482818", "25482794", "25482758", "25481640", "25472757", "25465335", "25463417", "25456430", "25454786", "25451646", "25450442", "25442280", "25438783", "25434107", "25430922", "25423071", "25422945", "25422941", "25421469", "25420146", "25401154", "25397675", "25382492", "25382295", "25380694", "25377920", "25370335", "25365177", "25350549", "25345817", "25320212", "25319065", "25311664", "25309222", "25294871", "25289772", "25284941", "25282501", "25278267", "25270669", "25266169", "25255853", "25250382", "25239559", "25236982", "25233643", "25221736", "25202630", "25202591", "25195836", "25183255", "25174959", "25168766", "25153202", "25141298", "25140872", "25122654", "25117357", "25102662", "25093401", "25089246", "25082140", "25078600", "25068598", "25060609", "25055082", "25034728", "25033767", "25025767", "25006180", "24998243", "24983476", "24980395", "24977213", "24974386", "24962394", "24961116", "24959588", "24952917", "24916060", "24913627", "24911363", "24907253", "24889383", "24845051", "24844664", "24832004", "24830683", "24830247", "24818583", "24817196", "24811734", "24800839", "24792982", "24777596", "24772822", "24772371", "24760407", "24752613", "24747128", "24747003", "24714568", "24697806", "24688055", "24659990", "24659825", "24641918", "24641728", "24635099", "24628585", "24621377", "24617632", "24615110", "24606694", "24571782", "24563211", "24550584", "24532077", "24527666", "24525191", "24520914", "24506021", "24498329", "24494717", "24482765", "24472146", "24471784", "24460947", "24454052", "24444001", "24409313", "24402568", "24392284", "24392013", "26462695", "24370475", "24366109", "24334741", "24325257", "24315348", "24310930", "24310615", "24308648", "24267156", "24265826", "24252216", "24249491", "24237204", "24223974", "24222213", "24218184", "24212691", "24212690", "24204832", "24173913", "24171312", "24169595", "24169591", "24147137", "24146569", "24146369", "24143222", "24136821", "24129175", "24112555", "24108313", "24107684", "24107683", "24106908", "24106667", "24090143", "24081823", "24074296", "24070862", "24065980", "24065668", "24052555", "24026354", "24009198", "24004516", "24001513", "23997231", "23984533", "23970076", "23969851", "23967332", "23956416", "23950085", "23950065", "23949981", "23949668", "23943081", "23939470", "23923483", "23917792", "23917575", "23917565", "23917564", "23917561", "23904568", "23894934", "23887919", "23857506", "23852996", "23841860", "23836678", "23833951", "23815661", "23815657", "23814439", "23812655", "23804617", "23795085", "23795032", "23780616", "23755534", "23751373", "23750181", "23738332", "23734209", "23701535", "23697165", "23692750", "23675685", "23647016", "23640259", "23638361", "23633002", "23618898", "23617896", "23616651", "23599555", "23597078", "23595103", "23593204", "23577161", "23577083", "23575662", "23563702", "23553724", "23545217", "23541634", "25202536", "23538115", "23532045", "23532043", "23526253", "23517677", "23504932", "23460979", "23448889", "25202528", "23440864", "23440505", "23435707", "23432406", "23417646", "23396600", "23396183", "23387843", "23387091", "23380059", "23359109", "23352400", "23347062", "23347020", "23331669", "23330773", "23324169", "23323053", "23315811", "23314755", "23308277", "23292456", "23277396", "23275632", "25366049", "25110576", "24699618", "23271631", "23262337", "23254458", "23251681", "23250404", "23249619", "23228193", "26255391", "23185288", "23136638", "23132617", "23129986", "23129981", "23125404", "23118724", "23117571", "23115134", "23091095", "23089850", "23070138", "23057699", "23055065", "23043621", "23032816", "23025596", "23014841", "23014813", "23008809", "23000275", "22997547", "22991932", "22988976", "22988937", "22967086", "22962357", "22961111", "22957702", "22945851", "22935364", "22928416", "22916031", "22904110", "22870305", "22851311", "22848645", "22822353", "22805697", "22805528", "22805274", "22805176", "22798654", "22796899", "22778148", "22775550", "22771852", "22765763", "22761711", "22753812", "22749731", "22735344", "22706647", "22692268", "22691199", "22688426", "22687369", "22685605", "22655861", "22649529", "22623495", "22571550", "22563130", "22554451", "22547659", "22519778", "22499266", "22496851", "22483052", "22418255", "22415688", "22408409", "22397405", "22391855", "22375900", "22367365", "22352154", "22350407", "22307645", "22303114", "22301129", "22298842", "22289766", "22275769", "22272942", "22268629", "22253763", "22228668", "22208094", "22203651", "22194849", "22128845", "22106437", "22106436", "22082802", "22079545", "22049092", "21995447", "21972891", "21925852", "21792224", "21779810", "21739239", "23921401", "23195932"], "RetStart": "0", "WebEnv": "NCID_1_13312974_165.112.9.37_9001_1459677505_1646131365_0MetA0_S_MegaStore_F_1", "TranslationSet": [], "QueryTranslation": "orchid[All fields] AND 2011/11/02[PDAT] : 2015/11/02[PDAT]", "Count": "533", "QueryKey": "1"}, -"parse": [{"FAU": ["Li, Jia-Wei", "Zhang, Shi-Bao"], "TI": "Differences in the Responses of Photosystems I and II in Cymbidium sinense and C. tracyanum to Long-Term Chilling Stress.", "MH": null, "AB": "The susceptibility of photosystem I (PSI) and photosystem II (PSII) to chilling stress depends on plant species, and cyclic electron flow (CEF) plays an important role in photoprotection for some species under short stress periods. However, little is known about the responses of PSI and PSII to long-term chilling stress. We studied two orchid species-Cymbidium sinense and C. tracyanum- that differ in their capacity to adapt to low temperature, and exposed plants for 19 d to stress conditions that included 4 degrees C and a light intensity of 250 to 350 mumol photons m(-2) s(-1). Meanwhile, we investigated their dynamic variations in Chl fluorescence and P700 parameters. After exposure to 4 degrees C and 250 mumol photons m(-2) s(-1) for 6 h, PSI activity was maintained stable in both species, but stronger PSII photoinhibition was observed in C. sinense. During the long-term treatment, the maximum quantum yield of PSII was significantly reduced, with that decrease being greater in C. sinense. After 19 d of chilling treatment, the maximum photo-oxidizable P700 declined only slightly in C. tracyanum but dropped significantly in C. sinense. Linear electron flow was largely depressed during the long-term chilling treatment, especially in C. sinense. Meanwhile, C. tracyanum showed higher CEF activity than C. sinense. These results indicate that PSII is more sensitive to chilling-light stress than PSI in both species. The rate of PSII photodamage at chilling-light stress is higher in C. sinense than C. tracyanum, and CEF contributes to photoprotection for PSI and PSII under long-term chilling stress in C. tracyanum.", "DP": "2015"}, {"FAU": ["Dos Santos, Aline Borba", "do Nascimento, Fabio Santos"], "TI": "Cuticular Hydrocarbons of Orchid Bees Males: Interspecific and Chemotaxonomy Variation.", "MH": null, "AB": "Recent studies have investigated the composition of compounds that cover the cuticle in social insects, but few studies have focused on solitary bees. Cuticular hydrocarbons may provide a tool for chemotaxonomy, and perhaps they can be used as a complement to morphology and genetic characters in phylogenetic studies. Orchid bees (Tribe Euglossini) are a highly diverse group of Neotropical bees with more than 200 species. Here, the cuticular hydrocarbons of 17 species were identified and statistical analysis revealed 108 compounds, which allowed for the taxonomic classification according to the genera. The most significant compounds discriminating the four genera were (Z)-9-pentacosene, (Z,Z)-pentatriacontene-3, (Z)-9-tricosene, and (Z)-9-heptacosene. The analyses demonstrated the potential use of CHCs to identify different species.", "DP": "2015"}, {"FAU": ["Ospina-Torres, Rodulfo", "Montoya-Pfeiffer, Paula Maria", "Parra-H, Alejandro", "Solarte, Victor", "Tupac Otero, Joel"], "TI": "Interaction networks and the use of floral resources by male orchid bees (Hymenoptera: Apidae: Euglossini) in a primary rain forests of the Choco Region (Colombia).", "MH": ["Animals", "Bees/classification/*physiology", "Colombia", "*Ecosystem", "Male", "Orchidaceae/*classification", "*Pollination", "Population Density", "Rainforest"], "AB": "Orchid bees are important keystone pollinators from the Neotropics. With the aim to study the relationships between orchid bees and their nectar and aromatic host species, we made systematic samplings of males across two conservation areas in the biogeographic Choc6 Region of Colombia. We used chemical baits to collect 352 male bees during five months. The pollen attached to their bodies was extracted for palynological identification and to estimate interaction networks. The euglossine community consisted of at least 22 species including Eg. maculilabris, Eg. orellana, Eg. championi and Eg. ignita. The male bees were associated with 84 plants but depended on a small group of them (Peperomia spp. and Anthurium spp, as well as species of Solanaceae, Ericaceae and Malpighiaceae) which were widely distributed across the altitudinal gradient, and were available through the year. The resulting interaction networks revealed a typical nested pattern usually found in plant-pollinator interactions, with several rare bee and plant species interaction with a small group of generalist bees and plant species. Albeit, we found variation within networks related to species composition. Such variation may be a consequence of specific differences in plant flowering phenology.", "DP": "2015 Sep"}, {"FAU": ["Nielsen, Lasse Janniche", "Moller, Birger Lindberg"], "TI": "Scent emission profiles from Darwin's orchid--Angraecum sesquipedale: Investigation of the aldoxime metabolism using clustering analysis.", "MH": null, "AB": "The display of scent is crucial for plants in attracting pollinating insects to flowers and ensuring successful pollination and reproduction. The large number of aldoxime volatile species present in the scent of the Madagascan orchid Angraecum sesquipedale has been suggested to play a primary role in attracting the sphingid moth Xanthopan morgani praedicta. By solid phase micro-extraction (SPME) coupled with gas chromatography-mass spectrometry (GC-MS), we monitored the scent release from different flowers of a single orchid, day and night throughout the entire flowering period. In separate experiments, the diurnal release was monitored in 3h intervals and the tissue specific release from the different floral parts was tracked. Numerous novel compounds related to the aldoxime metabolism not previously detected in A. sesquipedale were identified and positioned into a proposed pathway for aldoxime metabolism. From the results, we hypothesize that (E/Z)-phenylacetaldoxime and its derivatives could be important attractants for the pollinating moth X. morgani praedicta. By applying an untargeted Partitioning Around Medoids (PAM) cluster analysis to the metabolite profiles in the scent, the proposed pathways for the formation of aldoximes were substantiated. With this study, we demonstrate the powerful utility of a bioinformatics tool to aid in the elucidation of the routes of formation for volatiles and provide a benchmark and guidelines for future detailed observations of hawkmoth pollination of Angraecum species, and in particular A. sesquipedale, in the wild.", "DP": "2015 Dec"}, {"FAU": ["Oliveira, R", "Pinto, C E", "Schlindwein, C"], "TI": "Two common species dominate the species-rich Euglossine bee fauna of an Atlantic Rainforest remnant in Pernambuco, Brazil.", "MH": null, "AB": "Nowadays, the northern part of the Atlantic Rainforest of Brazil is largely destroyed and forest remnants rarely exceed 100 ha. In a 118 ha forest fragment within a state nature reserve of Pernambuco (Reserva Ecologica Gurjau), we surveyed the orchid bee fauna (Apidae, Euglossini) using eight different scent baits to attract males. Once a month during one year, the bees were actively collected with entomological nets, from November 2002 to October 2003 by two collectors. We collected 2,908 orchid bee males belonging to 23 species, one of the highest richness values of the Northern Atlantic Rainforest. Bees of only two species, Euglossa carolina (50%) and Eulaema nigrita (25%), which occurred throughout the year, accounted for three quarter of the collected individuals. Both species are typical for open or disturbed areas. Rainforest remnants like those of Gurjau within the predominant sugar cane monocultures in the coastal plains of the northern Atlantic Rainforest play an important role in orchid bee conservation and maintenance of biodiversity.", "DP": "2015 Nov"}] -} diff --git a/orangecontrib/text/tests/test_pubmed.py b/orangecontrib/text/tests/test_pubmed.py index 5447bda8b..841b7fbbd 100644 --- a/orangecontrib/text/tests/test_pubmed.py +++ b/orangecontrib/text/tests/test_pubmed.py @@ -6,10 +6,13 @@ import numpy as np from orangecontrib.text.pubmed import ( - Pubmed, PUBMED_TEXT_FIELDS, PUBMED_FIELD_DATE, + Pubmed, + PUBMED_TEXT_FIELDS, + PUBMED_FIELD_DATE, _mesh_headings_to_class, - _date_to_iso, _corpus_from_records, - _records_to_corpus_entries + _date_to_iso, + _corpus_from_records, + _records_to_corpus_entries, ) @@ -27,23 +30,25 @@ class MockEntrez: def __init__(self, cache): self.mock_responses = {} - with open(cache, 'r') as f: + with open(cache, "r") as f: self.mock_responses = json.loads(f.read()) + self.list = None def esearch(self, db, term, **keywds): return MockEntrezHandle() def read(self, handle): - return self.mock_responses.get('read') + return self.mock_responses.get("read") def efetch(self, db, **keywords): return MockEntrezHandle() def epost(self, db, **keywds): + self.list = keywds["id"].split(",") return MockEntrezHandle() def parse(self, handle): - return self.mock_responses.get('parse') + return [x for x in self.mock_responses.get("parse") if x["PMID"] in self.list] # Exception mocking. def esearch_exception(self, db, term, **keywds): @@ -56,7 +61,7 @@ def epost_exception(self, db, **keywds): raise IOError -CACHE = os.path.join(os.path.dirname(__file__), 'pubmed-cache.txt') +CACHE = os.path.join(os.path.dirname(__file__), "data", "pubmed-cache.txt") mock_entrez = MockEntrez(CACHE) @@ -69,101 +74,106 @@ def progress_callback(progress=None): class PubmedTests(unittest.TestCase): - EMAIL = 'mockemail@mockdomain.com' + EMAIL = "mockemail@mockdomain.com" def setUp(self): self.pubmed = Pubmed( - self.EMAIL, - progress_callback=progress_callback, - error_callback=error_callback + self.EMAIL, + progress_callback=progress_callback, + error_callback=error_callback, ) + self.__empty_cache() + + def tearDown(self): + self.__empty_cache() + + def __empty_cache(self): + try: + os.remove(self.pubmed.cache_path + ".db") + except OSError: + pass def test_mesh_headings_to_class(self): input_headings = [ - 'heading1 & heading2/heading3,heading4/*heading5', - 'heading1/heading2/*heading3', + "heading1 & heading2/heading3,heading4/*heading5", + "heading1/heading2/*heading3", ] - self.assertEqual(_mesh_headings_to_class(input_headings), 'heading1') + self.assertEqual(_mesh_headings_to_class(input_headings), "heading1") def test_date_to_iso(self): # Correct inputs. input_dates = [ - '2015 Nov', - '2015', - '2015 Sep-Oct', - '2015 Fall', + "2015 Nov", + "2015", + "2015 Sep-Oct", + "2015 Fall", ] correct_results = [ - '2015-11-01', - '2015-01-01', - '2015-09-01', - '2015-09-01', + "2015-11-01", + "2015-01-01", + "2015-09-01", + "2015-09-01", ] for date, result in zip(input_dates, correct_results): self.assertEqual(_date_to_iso(date), result) # Unexpected inputs. - unexpected_input = '2015 Unexpected' - self.assertWarns( - RuntimeWarning, - _date_to_iso, - unexpected_input, - ) + unexpected_input = "2015 Unexpected" + self.assertWarns(RuntimeWarning, _date_to_iso, unexpected_input) self.assertEqual(type(_date_to_iso(unexpected_input)), type(np.nan)) def test_record_to_corpus(self): mock_records = [ { - 'FAU': ['Mock Author 1', 'Mock Author 2'], - 'TI': 'Mock title', - 'MH': ['heading1/heading2'], - 'AB': 'Mock abstract', - 'DP': '2015 Sep', - 'PMID': 1, + "FAU": ["Mock Author 1", "Mock Author 2"], + "TI": "Mock title", + "MH": ["heading1/heading2"], + "AB": "Mock abstract", + "DP": "2015 Sep", + "PMID": 1, }, ] - correct_metas = np.array([ + correct_metas = np.array( [ - 'Mock Author 1 Mock Author 2', - 'Mock title', - 'heading1/heading2', - 'Mock abstract', - 'http://www.ncbi.nlm.nih.gov/pubmed/?term=1', - 1441065600.0 - ] - ], dtype=object) - correct_classes = np.array([ - 'heading1' - ]) + [ + "Mock Author 1 Mock Author 2", + "Mock title", + "heading1/heading2", + "Mock abstract", + "http://www.ncbi.nlm.nih.gov/pubmed/?term=1", + 1441065600.0, + ] + ], + dtype=object, + ) + correct_classes = np.array(["heading1"]) # Perform asserting. corpus = _corpus_from_records(mock_records, PUBMED_TEXT_FIELDS) meta_values, class_values = _records_to_corpus_entries( - mock_records, - PUBMED_TEXT_FIELDS, - corpus.domain[PUBMED_FIELD_DATE] + mock_records, PUBMED_TEXT_FIELDS, corpus.domain[PUBMED_FIELD_DATE] ) self.assertCountEqual(meta_values[0], correct_metas[0]) self.assertCountEqual(class_values, correct_classes) self.assertIsNotNone(corpus) self.assertEqual(corpus.language, "en") - @patch('Bio.Entrez.esearch', mock_entrez.esearch) - @patch('Bio.Entrez.read', mock_entrez.read) + @patch("Bio.Entrez.esearch", mock_entrez.esearch) + @patch("Bio.Entrez.read", mock_entrez.read) def test_pubmed_search_records(self): - test_terms = ['orchid'] + test_terms = ["orchid"] authors = [] - pub_date_start = '2011/07/07' - pub_date_end = '2014/07/07' + pub_date_start = "2011/07/07" + pub_date_end = "2014/07/07" self.pubmed._search_for_records( - terms=test_terms, - authors=authors, - pub_date_start=pub_date_start, - pub_date_end=pub_date_end + terms=test_terms, + authors=authors, + pub_date_start=pub_date_start, + pub_date_end=pub_date_end, ) # The only certain check is to make sure we got all the parameters. @@ -174,157 +184,154 @@ def test_pubmed_search_records(self): # Faulty input check. self.assertRaises( - ValueError, - self.pubmed._search_for_records, - terms=test_terms, - authors=None, - pub_date_start=pub_date_start, - pub_date_end=pub_date_end + ValueError, + self.pubmed._search_for_records, + terms=test_terms, + authors=None, + pub_date_start=pub_date_start, + pub_date_end=pub_date_end, ) - @patch('Bio.Entrez.esearch', mock_entrez.esearch) - @patch('Bio.Entrez.read', mock_entrez.read) - @patch('Bio.Entrez.efetch', mock_entrez.efetch) - @patch('Bio.Medline.parse', mock_entrez.parse) + @patch("Bio.Entrez.esearch", mock_entrez.esearch) + @patch("Bio.Entrez.read", mock_entrez.read) + @patch("Bio.Entrez.efetch", mock_entrez.efetch) + @patch("Bio.Medline.parse", mock_entrez.parse) def test_pubmed_retrieve_record_batch(self): - test_terms = ['orchid'] + test_terms = ["orchid"] authors = [] - pub_date_start = '2011/07/07' - pub_date_end = '2014/07/07' + pub_date_start = "2011/07/07" + pub_date_end = "2014/07/07" offset = 0 num_requested_records = 5 # Attempt to retrieve without searching first. self.assertRaises( - ValueError, - self.pubmed._retrieve_record_batch, - offset, - num_requested_records + ValueError, + self.pubmed._retrieve_record_batch, + offset, + num_requested_records, ) # Must search for records first. self.pubmed._search_for_records( - test_terms, - authors, - pub_date_start, - pub_date_end + test_terms, authors, pub_date_start, pub_date_end ) # Retrieve the records. - data = self.pubmed._retrieve_record_batch( - offset, - num_requested_records - ) + data = self.pubmed._retrieve_record_batch(offset, num_requested_records) self.assertEqual(len(data), num_requested_records) - @patch('Bio.Entrez.esearch', mock_entrez.esearch) - @patch('Bio.Entrez.read', mock_entrez.read) - @patch('Bio.Entrez.efetch', mock_entrez.efetch) - @patch('Bio.Medline.parse', mock_entrez.parse) - @patch('Bio.Entrez.epost', mock_entrez.epost) + @patch("Bio.Entrez.esearch", mock_entrez.esearch) + @patch("Bio.Entrez.read", mock_entrez.read) + @patch("Bio.Entrez.efetch", mock_entrez.efetch) + @patch("Bio.Medline.parse", mock_entrez.parse) + @patch("Bio.Entrez.epost", mock_entrez.epost) def test_pubmed_retrieve_records(self): - test_terms = ['orchid'] + test_terms = ["orchid"] authors = [] - pub_date_start = '2011/07/07' - pub_date_end = '2014/07/07' + pub_date_start = "2011/07/07" + pub_date_end = "2014/07/07" num_records = 5 # Must search for records first. self.pubmed._search_for_records( - test_terms, - authors, - pub_date_start, - pub_date_end + test_terms, authors, pub_date_start, pub_date_end ) # Retrieve the records and build a corpus. corpus = self.pubmed._retrieve_records(num_records) self.assertEqual(len(corpus), num_records) - meta_fields = sorted([field_name - for field_name, field_tag - in PUBMED_TEXT_FIELDS]) - test_meta_fields = sorted([m.name - for m - in corpus.domain.metas]) + meta_fields = sorted( + [field_name for field_name, field_tag in PUBMED_TEXT_FIELDS] + ) + test_meta_fields = sorted([m.name for m in corpus.domain.metas]) self.assertEqual(meta_fields, test_meta_fields) - @patch('Bio.Entrez.esearch', mock_entrez.esearch) - @patch('Bio.Entrez.read', mock_entrez.read) - @patch('Bio.Entrez.efetch', mock_entrez.efetch) - @patch('Bio.Medline.parse', mock_entrez.parse) - @patch('Bio.Entrez.epost', mock_entrez.epost) + @patch("Bio.Entrez.esearch", mock_entrez.esearch) + @patch("Bio.Entrez.read", mock_entrez.read) + @patch("Bio.Entrez.efetch", mock_entrez.efetch) + @patch("Bio.Medline.parse", mock_entrez.parse) + @patch("Bio.Entrez.epost", mock_entrez.epost) + def test_pubmed_retrieve_records_cache_and_new_records(self): + """Test retrieving combination of cached and new instances""" + test_terms = ["orchid"] + authors = [] + pub_date_start = "2011/07/07" + pub_date_end = "2014/07/07" + + # Must search for records first. + self.pubmed._search_for_records( + test_terms, authors, pub_date_start, pub_date_end + ) + + # retrieve records - they also get cached + self.assertEqual(len(self.pubmed._retrieve_records(3)), 3) + # retrieve more - three are cached and two are new + self.assertEqual(len(self.pubmed._retrieve_records(5)), 5) + + @patch("Bio.Entrez.esearch", mock_entrez.esearch) + @patch("Bio.Entrez.read", mock_entrez.read) + @patch("Bio.Entrez.efetch", mock_entrez.efetch) + @patch("Bio.Medline.parse", mock_entrez.parse) + @patch("Bio.Entrez.epost", mock_entrez.epost) def test_pubmed_retrieve_records_no_cache(self): - test_terms = ['orchid'] + test_terms = ["orchid"] authors = [] - pub_date_start = '2011/07/07' - pub_date_end = '2014/07/07' + pub_date_start = "2011/07/07" + pub_date_end = "2014/07/07" num_records = 5 # Must search for records first. self.pubmed._search_for_records( - test_terms, - authors, - pub_date_start, - pub_date_end + test_terms, authors, pub_date_start, pub_date_end ) # Retrieve the records and build a corpus. - corpus = self.pubmed._retrieve_records( - num_records, - use_cache=False - ) + corpus = self.pubmed._retrieve_records(num_records, use_cache=False) self.assertEqual(len(corpus), num_records) - meta_fields = sorted([field_name - for field_name, field_tag - in PUBMED_TEXT_FIELDS]) - test_meta_fields = sorted([m.name - for m - in corpus.domain.metas]) + meta_fields = sorted( + [field_name for field_name, field_tag in PUBMED_TEXT_FIELDS] + ) + test_meta_fields = sorted([m.name for m in corpus.domain.metas]) self.assertEqual(meta_fields, test_meta_fields) - @patch('Bio.Entrez.esearch', mock_entrez.esearch) - @patch('Bio.Entrez.read', mock_entrez.read) - @patch('Bio.Entrez.efetch', mock_entrez.efetch) - @patch('Bio.Medline.parse', mock_entrez.parse) - @patch('Bio.Entrez.epost', mock_entrez.epost) + @patch("Bio.Entrez.esearch", mock_entrez.esearch) + @patch("Bio.Entrez.read", mock_entrez.read) + @patch("Bio.Entrez.efetch", mock_entrez.efetch) + @patch("Bio.Medline.parse", mock_entrez.parse) + @patch("Bio.Entrez.epost", mock_entrez.epost) def test_download_records(self): - test_terms = ['orchid'] + test_terms = ["orchid"] authors = [] - pub_date_start = '2011/07/07' - pub_date_end = '2014/07/07' + pub_date_start = "2011/07/07" + pub_date_end = "2014/07/07" num_records = 5 # Retrieve the records and build a corpus. corpus = self.pubmed.download_records( - test_terms, - authors, - pub_date_start, - pub_date_end, - num_records + test_terms, authors, pub_date_start, pub_date_end, num_records ) self.assertEqual(len(corpus), num_records) - meta_fields = sorted([field_name - for field_name, field_tag - in PUBMED_TEXT_FIELDS]) - test_meta_fields = sorted([m.name - for m - in corpus.domain.metas]) + meta_fields = sorted( + [field_name for field_name, field_tag in PUBMED_TEXT_FIELDS] + ) + test_meta_fields = sorted([m.name for m in corpus.domain.metas]) self.assertEqual(meta_fields, test_meta_fields) - @patch('Bio.Entrez.esearch', mock_entrez.esearch_exception) + @patch("Bio.Entrez.esearch", mock_entrez.esearch_exception) def test_entrez_search_exceptions(self): # Search exception. - test_terms = ['orchid'] + test_terms = ["orchid"] authors = [] - pub_date_start = '2011/07/07' - pub_date_end = '2014/07/07' + pub_date_start = "2011/07/07" + pub_date_end = "2014/07/07" self.assertWarns( RuntimeWarning, @@ -332,63 +339,51 @@ def test_entrez_search_exceptions(self): terms=test_terms, authors=authors, pub_date_start=pub_date_start, - pub_date_end=pub_date_end + pub_date_end=pub_date_end, ) - @patch('Bio.Entrez.esearch', mock_entrez.esearch) - @patch('Bio.Entrez.read', mock_entrez.read) - @patch('Bio.Entrez.efetch', mock_entrez.efetch_exception) - @patch('Bio.Medline.parse', mock_entrez.parse) - @patch('Bio.Entrez.epost', mock_entrez.epost) + @patch("Bio.Entrez.esearch", mock_entrez.esearch) + @patch("Bio.Entrez.read", mock_entrez.read) + @patch("Bio.Entrez.efetch", mock_entrez.efetch_exception) + @patch("Bio.Medline.parse", mock_entrez.parse) + @patch("Bio.Entrez.epost", mock_entrez.epost) def test_pubmed_retrieve_record_batch_exception(self): - test_terms = ['orchid'] + test_terms = ["orchid"] authors = [] - pub_date_start = '2011/07/07' - pub_date_end = '2014/07/07' + pub_date_start = "2011/07/07" + pub_date_end = "2014/07/07" num_records = 5 # Must search for records first. self.pubmed._search_for_records( - test_terms, - authors, - pub_date_start, - pub_date_end + test_terms, authors, pub_date_start, pub_date_end ) self.assertWarns( - RuntimeWarning, - self.pubmed._retrieve_records, - num_records, - use_cache=False + RuntimeWarning, self.pubmed._retrieve_records, num_records, use_cache=False ) - @patch('Bio.Entrez.esearch', mock_entrez.esearch) - @patch('Bio.Entrez.read', mock_entrez.read) - @patch('Bio.Entrez.efetch', mock_entrez.efetch) - @patch('Bio.Medline.parse', mock_entrez.parse) - @patch('Bio.Entrez.epost', mock_entrez.epost_exception) + @patch("Bio.Entrez.esearch", mock_entrez.esearch) + @patch("Bio.Entrez.read", mock_entrez.read) + @patch("Bio.Entrez.efetch", mock_entrez.efetch) + @patch("Bio.Medline.parse", mock_entrez.parse) + @patch("Bio.Entrez.epost", mock_entrez.epost_exception) def test_pubmed_epost_exception(self): - test_terms = ['orchid'] + test_terms = ["orchid"] authors = [] - pub_date_start = '2011/07/07' - pub_date_end = '2014/07/07' + pub_date_start = "2011/07/07" + pub_date_end = "2014/07/07" num_records = 5 # Must search for records first. self.pubmed._search_for_records( - test_terms, - authors, - pub_date_start, - pub_date_end + test_terms, authors, pub_date_start, pub_date_end ) self.assertWarns( - RuntimeWarning, - self.pubmed._retrieve_records, - num_records, - use_cache=False + RuntimeWarning, self.pubmed._retrieve_records, num_records, use_cache=False ) diff --git a/orangecontrib/text/widgets/owpubmed.py b/orangecontrib/text/widgets/owpubmed.py index 206a632dd..87bec1ed1 100644 --- a/orangecontrib/text/widgets/owpubmed.py +++ b/orangecontrib/text/widgets/owpubmed.py @@ -3,18 +3,24 @@ from datetime import date from AnyQt.QtCore import QDate, Qt -from AnyQt.QtWidgets import (QApplication, QComboBox, QDateEdit, QTextEdit, - QFrame, QDialog, QCalendarWidget, QVBoxLayout, - QFormLayout) - +from AnyQt.QtWidgets import ( + QCalendarWidget, + QComboBox, + QDateEdit, + QDialog, + QFormLayout, + QFrame, + QTextEdit, + QVBoxLayout, +) from Orange.widgets import gui from Orange.widgets.credentials import CredentialManager from Orange.widgets.settings import Setting -from Orange.widgets.widget import OWWidget, Msg +from Orange.widgets.widget import Msg, OWWidget +from orangewidget.utils.signals import Output + from orangecontrib.text.corpus import Corpus -from orangecontrib.text.pubmed import ( - Pubmed, PUBMED_TEXT_FIELDS -) +from orangecontrib.text.pubmed import PUBMED_TEXT_FIELDS, Pubmed def _i(name, icon_path='icons'): @@ -29,10 +35,6 @@ def validate_email(email): return EMAIL_REGEX.match(email) -class Output: - CORPUS = 'Corpus' - - class OWPubmed(OWWidget): class EmailCredentialsDialog(OWWidget): name = "Pubmed Email" @@ -92,7 +94,6 @@ def accept(self, silent=False): icon = 'icons/Pubmed.svg' priority = 140 - outputs = [(Output.CORPUS, Corpus)] want_main_area = False resizing_enabled = False @@ -117,6 +118,9 @@ def accept(self, silent=False): email = None + class Outputs: + corpus = Output("Corpus", Corpus) + class Warning(OWWidget.Warning): no_query = Msg('Please specify the keywords for this query.') @@ -401,7 +405,7 @@ def retrieve_records(self): self.retrieve_records_button.setText('Retrieve records') self.download_running = False - self.send(Output.CORPUS, self.output_corpus) + self.Outputs.corpus.send(self.output_corpus) self.update_retrieval_info() self.run_search_button.setEnabled(True) @@ -531,7 +535,6 @@ def set_date(self, date): if __name__ == '__main__': - app = QApplication([]) - widget = OWPubmed() - widget.show() - app.exec() + from orangewidget.utils.widgetpreview import WidgetPreview + + WidgetPreview(OWPubmed).run()