From 055e71ce561d8462867aa02519ab5593a8cd8cfa Mon Sep 17 00:00:00 2001 From: Max Irwin Date: Thu, 10 Dec 2020 17:11:17 -0500 Subject: [PATCH] MD to RST --- README.rst | 222 ++++++++++++++++++++++++++++++++++++++++++++ convert_md_2_rst.py | 43 +++++++++ publish.sh | 1 + setup.py | 2 +- 4 files changed, 267 insertions(+), 1 deletion(-) create mode 100644 README.rst create mode 100644 convert_md_2_rst.py diff --git a/README.rst b/README.rst new file mode 100644 index 0000000..931f010 --- /dev/null +++ b/README.rst @@ -0,0 +1,222 @@ +Skipchunk +========= + +|Pypi| + +|Travis build status| + +|Documentation Status| + +Easy search autosuggest with NLP magic. + +Out of the box it provides a hassle-free autosuggest for any corpus from +scratch, and latent knowledge graph extraction and exploration. + +- Free software: MIT License + +- Documentation: https://skipchunk.readthedocs.io. + +Install +------- + +.. code:: bash + + + pip install skipchunk + + python -m spacy download 'en_core_web_lg' + + python -m nltk.downloader wordnet + +You also need to have Solr or Elasticsearch installed and running +somewhere! + +The current Solr supported version is 8.4.1, but it might work on other +versions. + +The current Elasticsearch supported version is 7.6.2, but it might work +on other versions. + +Use It! +------- + +See the ``./example/`` folder for an end-to-end OSC blog load: + +Solr +~~~~ + +Start Solr first! Doesn’t work with Solr cloud yet, but we’re working on +it. + +You’ll need to start solr using skipchunk’s solr_home directory for now. + +Then run this: ``python solr-blog-example.py`` + +Elasticsearch +~~~~~~~~~~~~~ + +Start Elasticsearch first! 
+ +Then run this: ``python elasticsearch-blog-example.py`` + +Features +-------- + +- Identifies and groups the noun phrases and verb phrases in a corpus + +- Indexes these phrases in Solr or Elasticsearch for a really good + out-of-the-box autosuggest + +- Structures the phrases as a graph so that + concept-relationship-concept can be easily found + +- Meant to handle batched updates as part of a full stack search + platform + +Library API +----------- + +Engine configuration +~~~~~~~~~~~~~~~~~~~~ + +You need an engine_config, as a dict, to create skipchunk. + +The dict must contain the following entries + +- host (the fully qualified URL of the engine web API endpoint) + +- name (the name of the graph) + +- path (the on-disk location of stateful data that will be kept) + +- engine_name (either “solr” or “elasticsearch”) + +Solr engine config example +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code:: python + + + engine_config_solr = { + + "host":"http://localhost:8983/solr/", + + "name":"osc-blog", + + "path":"./skipchunk_data", + + "engine_name":"solr" + + } + +Elasticsearch engine config example +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. 
code:: python + + + engine_config_elasticsearch = { + + "host":"http://localhost:9200/", + + "name":"osc-blog", + + "path":"./skipchunk_data", + + "engine_name":"elasticsearch" + + } + +Skipchunk Initialization +~~~~~~~~~~~~~~~~~~~~~~~~ + +When initializing Skipchunk, you will need to provide the constructor +with the following parameters + +- engine_config (the dict containing search engine connection details) + +- spacy_model=“en_core_web_lg” (the spacy model to use to parse text) + +- minconceptlength=1 (the minimum number of words that can appear in a + noun phrase) + +- maxconceptlength=3 (the maximum number of words that can appear in a + noun phrase) + +- minpredicatelength=1 (the minimum number of words that can appear in + a verb phrase) + +- maxpredicatelength=3 (the maximum number of words that can appear in + a verb phrase) + +- minlabels=1 (the number of times a concept/predicate must appear + before it is recognized and kept. The lower this number, the more + concepts will be kept - so be careful with large content sets!) + +- cache_documents=False + +- cache_pickle=False + +Skipchunk Methods +~~~~~~~~~~~~~~~~~ + +- ``tuplize(filename=source,fields=['title','content',...])`` (Produces + a list of (text,document) tuples ready for processing by the + enrichment.) + +- ``enrich(tuples)`` (Enriching can take a long time if you provide + lots of text. Consider batching at 10k docs at a time.) + +- ``save`` (Saves to pickle) + +- ``load`` (Loads from pickle) + +Graph API +~~~~~~~~~ + +After enrichment, you can then index the graph into the engine + +- ``index(skipchunk:Skipchunk)`` (Updates the knowledge graph in the + search engine) + +- ``delete`` (Deletes a knowledge graph - be careful!) 
+ +After indexing, you can call these methods to get autocompleted concepts +or walk the knowledge graph + +- ``conceptVerbConcepts(concept:str,verb:str,mincount=1,limit=100) -> list`` + ( Accepts a verb to find the concepts appearing in the same context) + +- ``conceptsNearVerb(verb:str,mincount=1,limit=100) -> list`` ( Accepts + a verb to find the concepts appearing in the same context) + +- ``verbsNearConcept(concept:str,mincount=1,limit=100) -> list`` ( + Accepts a concept to find the verbs appearing in the same context) + +- ``suggestConcepts(prefix:str,build=False) -> list`` ( Suggests a list + of concepts given a prefix) + +- ``suggestPredicates(prefix:str,build=False) -> list`` ( Suggests a + list of predicates given a prefix) + +- ``summarize(mincount=1,limit=100) -> list`` ( Summarizes a core) + +- ``graph(subject:str,objects=5,branches=10) -> list`` ( Gets the + subject-predicate-object neighborhood graph for a subject) + +Credits +------- + +Developed by Max Irwin, OpenSource Connections +https://opensourceconnections.com + +All the blog posts contained in the example directory are copyright +OpenSource Connections, and may not be used nor redistributed without +permission + +.. |Pypi| image:: https://img.shields.io/pypi/v/skipchunk.svg + :target: https://pypi.python.org/pypi/skipchunk +.. |Travis build status| image:: https://img.shields.io/travis/binarymax/skipchunk.svg + :target: https://travis-ci.org/binarymax/skipchunk +.. |Documentation Status| image:: https://readthedocs.org/projects/skipchunk/badge/?version=latest + :target: https://skipchunk.readthedocs.io/en/latest/?badge=latest diff --git a/convert_md_2_rst.py b/convert_md_2_rst.py new file mode 100644 index 0000000..cc78562 --- /dev/null +++ b/convert_md_2_rst.py @@ -0,0 +1,43 @@ +""" +https://github.com/listatree/convert_md_2_rst/ + +Convert Markdown to reStructuredText extension for Sphinx Doc +Scans for '.md' files and converts them to '.rst' files using pandoc. 
"""
https://github.com/listatree/convert_md_2_rst/

Convert Markdown to reStructuredText extension for Sphinx Doc
Scans for '.md' files and converts them to '.rst' files using pandoc.

For use it just copy this file to your source directory and add
'convert_md_2_rst' to the 'extensions' value of your 'conf.py' file.
Ensure that the source path is in the Python sys path. For that
purpose you may add this line to 'conf.py':
sys.path.insert(0, os.path.abspath('.'))
"""

import os

import pypandoc


def setup():
    """Walk the current working directory tree and convert every
    '.md' file found into a sibling '.rst' file."""
    root = os.path.abspath('.')
    for dirpath, _subdirs, filenames in os.walk(root):
        for name in filenames:
            # splitext always returns a 2-tuple; ext is '' when there
            # is no extension, so a plain equality check suffices.
            base, ext = os.path.splitext(os.path.join(dirpath, name))
            if ext == '.md':
                convert_md_2_rst_process(base)


def convert_md_2_rst_process(filename_root):
    """Convert ``<filename_root>.md`` to ``<filename_root>.rst`` via pandoc.

    :param filename_root: absolute path of the file WITHOUT extension.
    """
    filename_source = filename_root + ".md"
    filename_target = filename_root + ".rst"
    print('Converting', os.path.basename(filename_source), 'to', os.path.basename(filename_target))
    # BUGFIX: the original did '\n'.join(f.readlines()), but readlines()
    # keeps each line's trailing '\n', so every newline was doubled and
    # the markdown reached pandoc with spurious blank lines. Read the
    # file whole instead, with an explicit encoding (which also removes
    # the need for the manual data.encode('utf-8') step — pypandoc
    # accepts str input directly).
    with open(filename_source, encoding='utf-8') as file_source:
        data = file_source.read()
    data = pypandoc.convert(data, 'rst', format='markdown')
    with open(filename_target, "w", encoding='utf-8') as file_target:
        file_target.write(data)


setup()