From 8866a8f324e141b839fb33ace8d348256d2f4b96 Mon Sep 17 00:00:00 2001 From: jicruz96 Date: Thu, 18 Jul 2024 11:33:37 -0400 Subject: [PATCH 1/4] update pypi package long description to always track README --- setup.py | 111 ++----------------------------------------------------- 1 file changed, 3 insertions(+), 108 deletions(-) diff --git a/setup.py b/setup.py index be7f58330b..aab9dff06e 100644 --- a/setup.py +++ b/setup.py @@ -16,6 +16,7 @@ import shutil import sys from collections import OrderedDict +from pathlib import Path from setuptools import Extension, find_packages, setup, distutils from setuptools.command.build_ext import build_ext @@ -158,112 +159,6 @@ def run(self): cmdclass.update(vars(wheelhouse_uploader.cmd)) -LONG_DESCRIPTION = u""" -============================================== -gensim -- Topic Modelling in Python -============================================== - -|GA|_ -|Wheel|_ - -.. |GA| image:: https://github.com/RaRe-Technologies/gensim/actions/workflows/tests.yml/badge.svg?branch=develop -.. |Wheel| image:: https://img.shields.io/pypi/wheel/gensim.svg - -.. _GA: https://github.com/RaRe-Technologies/gensim/actions -.. _Downloads: https://pypi.org/project/gensim/ -.. _License: https://radimrehurek.com/gensim/intro.html#licensing -.. _Wheel: https://pypi.org/project/gensim/ - -Gensim is a Python library for *topic modelling*, *document indexing* and *similarity retrieval* with large corpora. -Target audience is the *natural language processing* (NLP) and *information retrieval* (IR) community. - -Features ---------- - -* All algorithms are **memory-independent** w.r.t. the corpus size (can process input larger than RAM, streamed, out-of-core) -* **Intuitive interfaces** - - * easy to plug in your own input corpus/datastream (simple streaming API) - * easy to extend with other Vector Space algorithms (simple transformation API) - -* Efficient multicore implementations of popular algorithms, such as online **Latent Semantic Analysis (LSA/LSI/SVD)**, - **Latent Dirichlet Allocation (LDA)**, **Random Projections (RP)**, **Hierarchical Dirichlet Process (HDP)** or **word2vec deep learning**. -* **Distributed computing**: can run *Latent Semantic Analysis* and *Latent Dirichlet Allocation* on a cluster of computers. -* Extensive `documentation and Jupyter Notebook tutorials `_. - - -If this feature list left you scratching your head, you can first read more about the `Vector -Space Model `_ and `unsupervised -document analysis `_ on Wikipedia. - -Installation ------------- - -This software depends on `NumPy and Scipy `_, two Python packages for scientific computing. -You must have them installed prior to installing `gensim`. - -It is also recommended you install a fast BLAS library before installing NumPy. This is optional, but using an optimized BLAS such as MKL, `ATLAS `_ or `OpenBLAS `_ is known to improve performance by as much as an order of magnitude. On OSX, NumPy picks up its vecLib BLAS automatically, so you don't need to do anything special. - -Install the latest version of gensim:: - - pip install --upgrade gensim - -Or, if you have instead downloaded and unzipped the `source tar.gz `_ package:: - - python setup.py install - - -For alternative modes of installation, see the `documentation `_. - -Gensim is being `continuously tested `_ under all `supported Python versions `_. -Support for Python 2.7 was dropped in gensim 4.0.0 – install gensim 3.8.3 if you must use Python 2.7. - - -How come gensim is so fast and memory efficient? Isn't it pure Python, and isn't Python slow and greedy? --------------------------------------------------------------------------------------------------------- - -Many scientific algorithms can be expressed in terms of large matrix operations (see the BLAS note above). Gensim taps into these low-level BLAS libraries, by means of its dependency on NumPy. So while gensim-the-top-level-code is pure Python, it actually executes highly optimized Fortran/C under the hood, including multithreading (if your BLAS is so configured). - -Memory-wise, gensim makes heavy use of Python's built-in generators and iterators for streamed data processing. Memory efficiency was one of gensim's `design goals `_, and is a central feature of gensim, rather than something bolted on as an afterthought. - -Documentation -------------- -* `QuickStart`_ -* `Tutorials`_ -* `Tutorial Videos`_ -* `Official Documentation and Walkthrough`_ - -Citing gensim -------------- - -When `citing gensim in academic papers and theses `_, please use this BibTeX entry:: - - @inproceedings{rehurek_lrec, - title = {{Software Framework for Topic Modelling with Large Corpora}}, - author = {Radim {\\v R}eh{\\r u}{\\v r}ek and Petr Sojka}, - booktitle = {{Proceedings of the LREC 2010 Workshop on New - Challenges for NLP Frameworks}}, - pages = {45--50}, - year = 2010, - month = May, - day = 22, - publisher = {ELRA}, - address = {Valletta, Malta}, - language={English} - } - ----------------- - -Gensim is open source software released under the `GNU LGPLv2.1 license `_. -Copyright (c) 2009-now Radim Rehurek - -.. _Official Documentation and Walkthrough: https://radimrehurek.com/gensim/ -.. _Tutorials: https://github.com/RaRe-Technologies/gensim/blob/develop/tutorials.md#tutorials -.. _Tutorial Videos: https://github.com/RaRe-Technologies/gensim/blob/develop/tutorials.md#videos -.. _QuickStart: https://radimrehurek.com/gensim/gensim_numfocus/auto_examples/core/run_core_concepts.html - -""" - distributed_env = ['Pyro4 >= 4.27'] visdom_req = ['visdom >= 0.1.8, != 0.1.8.7'] @@ -339,8 +234,8 @@ def run(self): name='gensim', version='4.3.2.dev0', description='Python framework for fast Vector Space Modelling', - long_description=LONG_DESCRIPTION, - + long_description=Path("README.md").read_text(), + long_description_content_type='text/markdown', ext_modules=ext_modules, cmdclass=cmdclass, packages=find_packages(), From 5a0166f873cb646f60f04931db133ea1d405696b Mon Sep 17 00:00:00 2001 From: jicruz96 Date: Thu, 18 Jul 2024 11:39:48 -0400 Subject: [PATCH 2/4] update README image paths --- README.md | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index b876aaf8b2..a715b9c3e5 100644 --- a/README.md +++ b/README.md @@ -123,21 +123,21 @@ Adopters | Company | Logo | Industry | Use of Gensim | |---------|------|----------|---------------| -| [RARE Technologies](https://rare-technologies.com/) | ![rare](docs/src/readme_images/rare.png) | ML & NLP consulting | Creators of Gensim – this is us! | -| [Amazon](http://www.amazon.com/) | ![amazon](docs/src/readme_images/amazon.png) | Retail | Document similarity. | -| [National Institutes of Health](https://github.com/NIHOPA/pipeline_word2vec) | ![nih](docs/src/readme_images/nih.png) | Health | Processing grants and publications with word2vec. | -| [Cisco Security](http://www.cisco.com/c/en/us/products/security/index.html) | ![cisco](docs/src/readme_images/cisco.png) | Security | Large-scale fraud detection. | -| [Mindseye](http://www.mindseyesolutions.com/) | ![mindseye](docs/src/readme_images/mindseye.png) | Legal | Similarities in legal documents. | -| [Channel 4](http://www.channel4.com/) | ![channel4](docs/src/readme_images/channel4.png) | Media | Recommendation engine. | -| [Talentpair](http://talentpair.com) | ![talent-pair](docs/src/readme_images/talent-pair.png) | HR | Candidate matching in high-touch recruiting. | -| [Juju](http://www.juju.com/) | ![juju](docs/src/readme_images/juju.png) | HR | Provide non-obvious related job suggestions. | -| [Tailwind](https://www.tailwindapp.com/) | ![tailwind](docs/src/readme_images/tailwind.png) | Media | Post interesting and relevant content to Pinterest. | -| [Issuu](https://issuu.com/) | ![issuu](docs/src/readme_images/issuu.png) | Media | Gensim's LDA module lies at the very core of the analysis we perform on each uploaded publication to figure out what it's all about. | -| [Search Metrics](http://www.searchmetrics.com/) | ![search-metrics](docs/src/readme_images/search-metrics.png) | Content Marketing | Gensim word2vec used for entity disambiguation in Search Engine Optimisation. | -| [12K Research](https://12k.com/) | ![12k](docs/src/readme_images/12k.png)| Media | Document similarity analysis on media articles. | -| [Stillwater Supercomputing](http://www.stillwater-sc.com/) | ![stillwater](docs/src/readme_images/stillwater.png) | Hardware | Document comprehension and association with word2vec. | -| [SiteGround](https://www.siteground.com/) | ![siteground](docs/src/readme_images/siteground.png) | Web hosting | An ensemble search engine which uses different embeddings models and similarities, including word2vec, WMD, and LDA. | -| [Capital One](https://www.capitalone.com/) | ![capitalone](docs/src/readme_images/capitalone.png) | Finance | Topic modeling for customer complaints exploration. | +| [RARE Technologies](https://rare-technologies.com/) | ![rare](https://raw.githubusercontent.com/piskvorky/gensim/develop/docs/src/readme_images/rare.png) | ML & NLP consulting | Creators of Gensim – this is us! | +| [Amazon](http://www.amazon.com/) | ![amazon](https://raw.githubusercontent.com/piskvorky/gensim/develop/docs/src/readme_images/amazon.png) | Retail | Document similarity. | +| [National Institutes of Health](https://github.com/NIHOPA/pipeline_word2vec) | ![nih](https://raw.githubusercontent.com/piskvorky/gensim/develop/docs/src/readme_images/nih.png) | Health | Processing grants and publications with word2vec. | +| [Cisco Security](http://www.cisco.com/c/en/us/products/security/index.html) | ![cisco](https://raw.githubusercontent.com/piskvorky/gensim/develop/docs/src/readme_images/cisco.png) | Security | Large-scale fraud detection. | +| [Mindseye](http://www.mindseyesolutions.com/) | ![mindseye](https://raw.githubusercontent.com/piskvorky/gensim/develop/docs/src/readme_images/mindseye.png) | Legal | Similarities in legal documents. | +| [Channel 4](http://www.channel4.com/) | ![channel4](https://raw.githubusercontent.com/piskvorky/gensim/develop/docs/src/readme_images/channel4.png) | Media | Recommendation engine. | +| [Talentpair](http://talentpair.com) | ![talent-pair](https://raw.githubusercontent.com/piskvorky/gensim/develop/docs/src/readme_images/talent-pair.png) | HR | Candidate matching in high-touch recruiting. | +| [Juju](http://www.juju.com/) | ![juju](https://raw.githubusercontent.com/piskvorky/gensim/develop/docs/src/readme_images/juju.png) | HR | Provide non-obvious related job suggestions. | +| [Tailwind](https://www.tailwindapp.com/) | ![tailwind](https://raw.githubusercontent.com/piskvorky/gensim/develop/docs/src/readme_images/tailwind.png) | Media | Post interesting and relevant content to Pinterest. | +| [Issuu](https://issuu.com/) | ![issuu](https://raw.githubusercontent.com/piskvorky/gensim/develop/docs/src/readme_images/issuu.png) | Media | Gensim's LDA module lies at the very core of the analysis we perform on each uploaded publication to figure out what it's all about. | +| [Search Metrics](http://www.searchmetrics.com/) | ![search-metrics](https://raw.githubusercontent.com/piskvorky/gensim/develop/docs/src/readme_images/search-metrics.png) | Content Marketing | Gensim word2vec used for entity disambiguation in Search Engine Optimisation. | +| [12K Research](https://12k.com/) | ![12k](https://raw.githubusercontent.com/piskvorky/gensim/develop/docs/src/readme_images/12k.png)| Media | Document similarity analysis on media articles. | +| [Stillwater Supercomputing](http://www.stillwater-sc.com/) | ![stillwater](https://raw.githubusercontent.com/piskvorky/gensim/develop/docs/src/readme_images/stillwater.png) | Hardware | Document comprehension and association with word2vec. | +| [SiteGround](https://www.siteground.com/) | ![siteground](https://raw.githubusercontent.com/piskvorky/gensim/develop/docs/src/readme_images/siteground.png) | Web hosting | An ensemble search engine which uses different embeddings models and similarities, including word2vec, WMD, and LDA. | +| [Capital One](https://www.capitalone.com/) | ![capitalone](https://raw.githubusercontent.com/piskvorky/gensim/develop/docs/src/readme_images/capitalone.png) | Finance | Topic modeling for customer complaints exploration. | ------- @@ -179,4 +179,3 @@ BibTeX entry: [OpenBLAS]: https://xianyi.github.io/OpenBLAS/ [source tar.gz]: https://pypi.org/project/gensim/ [documentation]: https://radimrehurek.com/gensim/#install - From 17f1113de64ff2ad967bfb4b264992d5c5bfc757 Mon Sep 17 00:00:00 2001 From: jicruz96 Date: Mon, 22 Jul 2024 10:29:43 -0400 Subject: [PATCH 3/4] revert raw git urls in README.md --- README.md | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index a715b9c3e5..b876aaf8b2 100644 --- a/README.md +++ b/README.md @@ -123,21 +123,21 @@ Adopters | Company | Logo | Industry | Use of Gensim | |---------|------|----------|---------------| -| [RARE Technologies](https://rare-technologies.com/) | ![rare](https://raw.githubusercontent.com/piskvorky/gensim/develop/docs/src/readme_images/rare.png) | ML & NLP consulting | Creators of Gensim – this is us! | -| [Amazon](http://www.amazon.com/) | ![amazon](https://raw.githubusercontent.com/piskvorky/gensim/develop/docs/src/readme_images/amazon.png) | Retail | Document similarity. | -| [National Institutes of Health](https://github.com/NIHOPA/pipeline_word2vec) | ![nih](https://raw.githubusercontent.com/piskvorky/gensim/develop/docs/src/readme_images/nih.png) | Health | Processing grants and publications with word2vec. | -| [Cisco Security](http://www.cisco.com/c/en/us/products/security/index.html) | ![cisco](https://raw.githubusercontent.com/piskvorky/gensim/develop/docs/src/readme_images/cisco.png) | Security | Large-scale fraud detection. | -| [Mindseye](http://www.mindseyesolutions.com/) | ![mindseye](https://raw.githubusercontent.com/piskvorky/gensim/develop/docs/src/readme_images/mindseye.png) | Legal | Similarities in legal documents. | -| [Channel 4](http://www.channel4.com/) | ![channel4](https://raw.githubusercontent.com/piskvorky/gensim/develop/docs/src/readme_images/channel4.png) | Media | Recommendation engine. | -| [Talentpair](http://talentpair.com) | ![talent-pair](https://raw.githubusercontent.com/piskvorky/gensim/develop/docs/src/readme_images/talent-pair.png) | HR | Candidate matching in high-touch recruiting. | -| [Juju](http://www.juju.com/) | ![juju](https://raw.githubusercontent.com/piskvorky/gensim/develop/docs/src/readme_images/juju.png) | HR | Provide non-obvious related job suggestions. | -| [Tailwind](https://www.tailwindapp.com/) | ![tailwind](https://raw.githubusercontent.com/piskvorky/gensim/develop/docs/src/readme_images/tailwind.png) | Media | Post interesting and relevant content to Pinterest. | -| [Issuu](https://issuu.com/) | ![issuu](https://raw.githubusercontent.com/piskvorky/gensim/develop/docs/src/readme_images/issuu.png) | Media | Gensim's LDA module lies at the very core of the analysis we perform on each uploaded publication to figure out what it's all about. | -| [Search Metrics](http://www.searchmetrics.com/) | ![search-metrics](https://raw.githubusercontent.com/piskvorky/gensim/develop/docs/src/readme_images/search-metrics.png) | Content Marketing | Gensim word2vec used for entity disambiguation in Search Engine Optimisation. | -| [12K Research](https://12k.com/) | ![12k](https://raw.githubusercontent.com/piskvorky/gensim/develop/docs/src/readme_images/12k.png)| Media | Document similarity analysis on media articles. | -| [Stillwater Supercomputing](http://www.stillwater-sc.com/) | ![stillwater](https://raw.githubusercontent.com/piskvorky/gensim/develop/docs/src/readme_images/stillwater.png) | Hardware | Document comprehension and association with word2vec. | -| [SiteGround](https://www.siteground.com/) | ![siteground](https://raw.githubusercontent.com/piskvorky/gensim/develop/docs/src/readme_images/siteground.png) | Web hosting | An ensemble search engine which uses different embeddings models and similarities, including word2vec, WMD, and LDA. | -| [Capital One](https://www.capitalone.com/) | ![capitalone](https://raw.githubusercontent.com/piskvorky/gensim/develop/docs/src/readme_images/capitalone.png) | Finance | Topic modeling for customer complaints exploration. | +| [RARE Technologies](https://rare-technologies.com/) | ![rare](docs/src/readme_images/rare.png) | ML & NLP consulting | Creators of Gensim – this is us! | +| [Amazon](http://www.amazon.com/) | ![amazon](docs/src/readme_images/amazon.png) | Retail | Document similarity. | +| [National Institutes of Health](https://github.com/NIHOPA/pipeline_word2vec) | ![nih](docs/src/readme_images/nih.png) | Health | Processing grants and publications with word2vec. | +| [Cisco Security](http://www.cisco.com/c/en/us/products/security/index.html) | ![cisco](docs/src/readme_images/cisco.png) | Security | Large-scale fraud detection. | +| [Mindseye](http://www.mindseyesolutions.com/) | ![mindseye](docs/src/readme_images/mindseye.png) | Legal | Similarities in legal documents. | +| [Channel 4](http://www.channel4.com/) | ![channel4](docs/src/readme_images/channel4.png) | Media | Recommendation engine. | +| [Talentpair](http://talentpair.com) | ![talent-pair](docs/src/readme_images/talent-pair.png) | HR | Candidate matching in high-touch recruiting. | +| [Juju](http://www.juju.com/) | ![juju](docs/src/readme_images/juju.png) | HR | Provide non-obvious related job suggestions. | +| [Tailwind](https://www.tailwindapp.com/) | ![tailwind](docs/src/readme_images/tailwind.png) | Media | Post interesting and relevant content to Pinterest. | +| [Issuu](https://issuu.com/) | ![issuu](docs/src/readme_images/issuu.png) | Media | Gensim's LDA module lies at the very core of the analysis we perform on each uploaded publication to figure out what it's all about. | +| [Search Metrics](http://www.searchmetrics.com/) | ![search-metrics](docs/src/readme_images/search-metrics.png) | Content Marketing | Gensim word2vec used for entity disambiguation in Search Engine Optimisation. | +| [12K Research](https://12k.com/) | ![12k](docs/src/readme_images/12k.png)| Media | Document similarity analysis on media articles. | +| [Stillwater Supercomputing](http://www.stillwater-sc.com/) | ![stillwater](docs/src/readme_images/stillwater.png) | Hardware | Document comprehension and association with word2vec. | +| [SiteGround](https://www.siteground.com/) | ![siteground](docs/src/readme_images/siteground.png) | Web hosting | An ensemble search engine which uses different embeddings models and similarities, including word2vec, WMD, and LDA. | +| [Capital One](https://www.capitalone.com/) | ![capitalone](docs/src/readme_images/capitalone.png) | Finance | Topic modeling for customer complaints exploration. | ------- @@ -179,3 +179,4 @@ BibTeX entry: [OpenBLAS]: https://xianyi.github.io/OpenBLAS/ [source tar.gz]: https://pypi.org/project/gensim/ [documentation]: https://radimrehurek.com/gensim/#install + From 681cceb5bd278c6afcec799b39afbd61f67c8a10 Mon Sep 17 00:00:00 2001 From: jicruz96 Date: Mon, 22 Jul 2024 10:30:34 -0400 Subject: [PATCH 4/4] dynamically update image paths in README during setup.py build to use git urls --- setup.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 004e94dadd..50f03cf70e 100644 --- a/setup.py +++ b/setup.py @@ -84,6 +84,18 @@ def make_cpp_ext(use_cython=False): ) +def get_long_description(): + long_description = Path("README.md").read_text(encoding="utf-8") + # + # We update the image paths in the README to a GitHub URL so that they render correctly on PyPI. + # https://stackoverflow.com/questions/41983209/how-do-i-add-images-to-a-pypi-readme-that-works-on-github + # + docs_path = "docs/src/readme_images/" + raw_github_url = "https://raw.githubusercontent.com/piskvorky/gensim/master/" + docs_path + long_description = long_description.replace(docs_path, raw_github_url) + return long_description + + # # We use use_cython=False here for two reasons: # @@ -237,7 +249,7 @@ def run(self): name='gensim', version='4.3.2.dev0', description='Python framework for fast Vector Space Modelling', - long_description=Path("README.md").read_text(), + long_description=get_long_description(), long_description_content_type='text/markdown', ext_modules=ext_modules, cmdclass=cmdclass,