diff --git a/.ipynb_checkpoints/MongoDB-checkpoint.ipynb b/.ipynb_checkpoints/MongoDB-checkpoint.ipynb new file mode 100644 index 0000000..2eccdba --- /dev/null +++ b/.ipynb_checkpoints/MongoDB-checkpoint.ipynb @@ -0,0 +1,373 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pprint\n", + "import re\n", + "import pymongo, json\n", + "\n", + "pp = pprint.PrettyPrinter(indent=1,width=65)\n", + "\n", + "client = pymongo.MongoClient (host=\"da1.eecs.utk.edu\")\n", + "db = client ['fdac19mp2']\n", + "coll = db ['jball16']\n", + "#1\n", + "coll.insert_one ( { 'topic':'GitHub bugs', 'title': 'bugfiles', 'license': 'http://creativecommons.org/licenses/by/4.0/legalcode', 'description': 'This dataset contains bug reports, commit history, and API descriptions of six open source Java projects including Eclipse Platform UI, SWT, JDT, AspectJ, Birt, and Tomcat. This dataset was used to evaluate a learning to rank approach that recommends relevant files for bug reports.', 'urls': [ 'https://doi.org/10.5281/zenodo.268486' ] } )\n", + "#2\n", + "coll.insert_one ( { 'topic':'Github bugs', 'title': 'GitHub issue titles and descriptions for NLP analysis.', 'license': 'MIT License, Copyright (c) 2018 David Shinn, Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the \"Software\"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.', 'description': 'Over 8 million GitHub issue titles and descriptions from 2017. Prepared from instructions at How To Create Data Products That Are Magical Using Sequence-to-Sequence Models.', 'urls': [ 'https://www.githubarchive.org/' ] } )\n", + "#3\n", + "coll.insert_one ( { 'topic':'StackOverflow questions/answers', 'title': 'StackSample: 10% of Stack Overflow Q&A', 'license': 'All Stack Overflow user contributions are licensed under CC-BY-SA 3.0 with attribution required.', 'description': 'Dataset with the text of 10% of questions and answers from the Stack Overflow programming Q&A website. This is organized as three tables: Questions contains the title, body, creation date, closed date (if applicable), score, and owner ID for all non-deleted Stack Overflow questions whose Id is a multiple of 10. Answers contains the body, creation date, score, and owner ID for each of the answers to these questions. The ParentId column links back to the Questions table. Tags contains the tags on each of these questions', 'urls': [ 'https://www.kaggle.com/stackoverflow/stacksample' ] } )\n", + "#4\n", + "coll.insert_one ( { 'topic':'Python questions on StackOverflow', 'title': 'Python Questions from Stack Overflow', 'license': 'All Stack Overflow user contributions are licensed under CC-BY-SA 3.0 with attribution required.', 'description': 'Full text of all questions and answers from Stack Overflow that are tagged with the python tag. Useful for natural language processing and community analysis.', 'urls': [ 'https://www.kaggle.com/stackoverflow/pythonquestions' ] } )\n", + "#5\n", + "coll.insert_one ( { 'topic':'StackOverflow questions/answers', 'title': 'StackLite: Stack Overflow questions and tags', 'license': 'http://opendatacommons.org/licenses/dbcl/1.0/', 'description': 'A dataset of Stack Overflow programming questions. For each question, it includes: Question ID, Creation date, Closed date, if applicable, Score, Owner user ID, Number of answers, Tags', 'urls': [ 'https://archive.org/details/stackexchange' ] } )\n", + "#6\n", + "coll.insert_one ( { 'topic':'StackOverflow questions/answers', 'title': 'R Questions from Stack Overflow', 'license': 'All Stack Overflow user contributions are licensed under CC-BY-SA 3.0 with attribution required.', 'description': 'Full text of questions and answers from Stack Overflow that are tagged with the r tag, useful for natural language processing and community analysis.', 'urls': [ 'https://www.kaggle.com/stackoverflow/rquestions' ] } )\n", + "#7\n", + "coll.insert_one ( { 'topic':'StackOverflow BigQuery Dataset', 'title': 'Stack Overflow Data', 'license': 'https://creativecommons.org/licenses/by-sa/3.0/', 'description': 'Updated on a quarterly basis, this BigQuery dataset includes an archive of Stack Overflow content, including posts, votes, tags, and badges. This dataset is updated to mirror the Stack Overflow content on the Internet Archive, and is also available through the Stack Exchange Data Explorer.', 'urls': [ 'https://www.kaggle.com/stackoverflow/stackoverflow' ] } )\n", + "#8\n", + "coll.insert_one ( { 'topic':'StackOverflow questions/answers', 'title': 'StackOverflow Questions and Answers', 'license': 'https://creativecommons.org/licenses/by-sa/3.0/', 'description': 'Contains the top 50,000 answers (by vote count, per month) on StackOverflow and their questions, where the answers were created between April 1, 2019 and December 1, 2015 (non-inclusive). The dataset excludes answers that contain a hyperlink (i.e. have an anchor tag in them).', 'urls': [ 'https://www.kaggle.com/metrovirus/stackoverflow' ] } )\n", + "#9\n", + "coll.insert_one ( { 'topic':'StackExchange questions', 'title': 'Questions from Cross Validated Stack Exchange', 'license': 'All Stack Exchange user contributions are licensed under CC-BY-SA 3.0 with attribution required.', 'description': 'Full text of questions and answers from Cross Validated, the statistics and machine learning Q&A site from the Stack Exchange network.', 'urls': [ 'https://www.kaggle.com/stackoverflow/statsquestions' ] } )\n", + "#10\n", + "coll.insert_one ( { 'topic':'Pandas issues', 'title': 'Pandas QA on Stack Overflow', 'license': 'https://creativecommons.org/licenses/by-sa/3.0/', 'description': '', 'urls': [ 'https://www.kaggle.com/vivek42/pandas-qa-on-stack-overflow' ] } )\n", + "#11\n", + "coll.insert_one ( { 'topic':'StackOverflow', 'title': 'Stack Overflow 2018 questions data set', 'license': 'Unknown', 'description': 'In this dataset, we explore StackOverflow questions and try to use unsupervised algorithms to extract tags, then train classifiers capable of suggesting tags to users who submit a question.', 'urls': [ 'https://www.kaggle.com/moulhanout/stack-overflow-2018-questions-data-set' ] } )\n", + "#12\n", + "coll.insert_one ( { 'topic':'StackOverflow questions/answers', 'title': 'Stack Overflow', 'license': 'Unknown', 'description': 'Archive of Stack Overflow posts, votes, tags and badges', 'urls': [ 'https://console.cloud.google.com/gcr/images/tensorflow/GLOBAL/tpu-models?filter=solution-type%3Adataset&filter=category%3Asocial&subtask=details&subtaskValue=stack-exchange%2Fstack-overflow' ] } )\n", + "#13\n", + "coll.insert_one ( { 'topic':'StackExchange', 'title': 'StackExchangeData', 'license': 'Unknown', 'description': '', 'urls': [ 'https://www.kaggle.com/dklvch/stackexchangedata' ] } )\n", + "#14\n", + "coll.insert_one ( { 'topic':'StackOverflow questions/answers', 'title': 'curated_stackoverflow_dataset_for_Q_&_A', 'license': 'Unknown', 'description': 'Q&A project', 'urls': [ 'https://www.kaggle.com/pushpendra7/curated-stackoverflow-dataset-for-q-a/discussion?sortBy=hot&group=upvoted' ] } )\n", + "#15\n", + "coll.insert_one ( { 'topic':'GitHub commits', 'title': '452M commits on GitHub', 'license': 'CC BY-NC', 'description': 'part-000xx.lzo - LZO archives with the data (refer to \"Format\"). part-000xx.lzo.index - LZO index files so that the archives are splittable in Hadoop. stats.csv.gz - GZIP-ed CSV file with some repository statistics related to the commits. Format part-000xx - text, one line per repository, every line is JSON. Date and time format is mostly Go language\\'s time.Time.String(), we recommend using dateutil.parse() to parse it with Python. Commit message contains explicit \\r and symbols in order to be a single line. stats.csv has 4 columns: repository name, number of commits, number of contributors, average length of the commit messages.', 'urls': [ 'https://zenodo.org/record/285467', 'https://data.world/vmarkovtsev/452-m-commits-on-github' ] } )\n", + "#16\n", + "coll.insert_one ( { 'topic':'Java projects on GitHub', 'title': 'GitHub Java Corpus', 'license': 'https://creativecommons.org/licenses/by/4.0', 'description': 'The GitHub Java Corpus is a snapshot of all open-source Java code on GitHub in October 2012 that is contained in open-source projects that at the time had at least one fork. It contains code from 14,785 projects amounting to about 352 million lines of code. The dataset has been used to study coding practice in Java at a large scale.', 'urls': [ 'https://datashare.is.ed.ac.uk/handle/10283/2334', 'https://search.datacite.org/works/10.7488/ds/1690' ] } )\n", + "#17\n", + "coll.insert_one ( { 'topic':'Source code typos', 'title': 'typos', 'license': 'ODC-ODbL', 'description': '7375 typos developers made in source code identifiers, e.g. class names, function names, variable names, and fixed them on GitHub. See the Origin section about how they were mined.', 'urls': [ 'https://data.world/source-d/typos' ] } )\n", + "#18\n", + "coll.insert_one ( { 'topic':'StackOverflow questions', 'title': 'Stackoverflow question favourites', 'license': 'https://creativecommons.org/licenses/by-sa/3.0/', 'description': '', 'urls': [ 'https://www.kaggle.com/iancuv/stackoverflow-question-favourites' ] } )\n", + "#19\n", + "coll.insert_one ( { 'topic':'StackOverflow', 'title': 'stackoverflow', 'license': 'Unknown', 'description': '', 'urls': [ 'https://www.kaggle.com/pankajkarki/stackoverflow' ] } )\n", + "#20\n", + "coll.insert_one ( { 'topic':'StackOverflow tags', 'title': 'Stack Overflow Tag Network', 'license': 'https://creativecommons.org/licenses/by-sa/3.0/', 'description': 'On the data team at Stack Overflow, we spend a lot of time and energy thinking about tech ecosystems and how technologies are related to each other. One way to get at this idea of relationships between technologies is tag correlations, how often technology tags at Stack Overflow appear together relative to how often they appear separately. One place we see developers using tags at Stack Overflow is on their Developer Stories, or professional profiles/CVs/resumes. If we are interested in how technologies are connected and how they are used together, developers\\' own descriptions of their work and careers is a great place to get that.', 'urls': [ 'https://www.kaggle.com/stackoverflow/stack-overflow-tag-network' ] } )" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'_id': ObjectId('5d8514c46f91803ac7230c59'),\n", + " 'description': 'This dataset contains bug reports, commit '\n", + " 'history, and API descriptions of six open '\n", + " 'source Java projects including Eclipse '\n", + " 'Platform UI, SWT, JDT, AspectJ, Birt, and '\n", + " 'Tomcat. This dataset was used to evaluate a '\n", + " 'learning to rank approach that recommends '\n", + " 'relevant files for bug reports.',\n", + " 'license': 'http://creativecommons.org/licenses/by/4.0/legalcode',\n", + " 'title': 'bugfiles',\n", + " 'topic': 'GitHub bugs',\n", + " 'urls': ['https://doi.org/10.5281/zenodo.268486']}\n", + "{'_id': ObjectId('5d8514c46f91803ac7230c5a'),\n", + " 'description': 'Over 8 million GitHub issue titles and '\n", + " 'descriptions from 2017. Prepared from '\n", + " 'instructions at How To Create Data Products '\n", + " 'That Are Magical Using Sequence-to-Sequence '\n", + " 'Models.',\n", + " 'license': 'MIT License, Copyright (c) 2018 David Shinn, '\n", + " 'Permission is hereby granted, free of charge, to '\n", + " 'any person obtaining a copy of this software and '\n", + " 'associated documentation files (the \"Software\"), '\n", + " 'to deal in the Software without restriction, '\n", + " 'including without limitation the rights to use, '\n", + " 'copy, modify, merge, publish, distribute, '\n", + " 'sublicense, and/or sell copies of the Software, '\n", + " 'and to permit persons to whom the Software is '\n", + " 'furnished to do so, subject to the following '\n", + " 'conditions: The above copyright notice and this '\n", + " 'permission notice shall be included in all copies '\n", + " 'or substantial portions of the Software. THE '\n", + " 'SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF '\n", + " 'ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT '\n", + " 'LIMITED TO THE WARRANTIES OF MERCHANTABILITY, '\n", + " 'FITNESS FOR A PARTICULAR PURPOSE AND '\n", + " 'NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR '\n", + " 'COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES '\n", + " 'OR OTHER LIABILITY, WHETHER IN AN ACTION OF '\n", + " 'CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF '\n", + " 'OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR '\n", + " 'OTHER DEALINGS IN THE SOFTWARE.',\n", + " 'title': 'GitHub issue titles and descriptions for NLP '\n", + " 'analysis.',\n", + " 'topic': 'Github bugs',\n", + " 'urls': ['https://www.githubarchive.org/']}\n", + "{'_id': ObjectId('5d8514c46f91803ac7230c5b'),\n", + " 'description': 'Dataset with the text of 10% of questions and '\n", + " 'answers from the Stack Overflow programming '\n", + " 'Q&A website. This is organized as three '\n", + " 'tables: Questions contains the title, body, '\n", + " 'creation date, closed date (if applicable), '\n", + " 'score, and owner ID for all non-deleted Stack '\n", + " 'Overflow questions whose Id is a multiple of '\n", + " '10. Answers contains the body, creation date, '\n", + " 'score, and owner ID for each of the answers to '\n", + " 'these questions. The ParentId column links '\n", + " 'back to the Questions table. Tags contains the '\n", + " 'tags on each of these questions',\n", + " 'license': 'All Stack Overflow user contributions are licensed '\n", + " 'under CC-BY-SA 3.0 with attribution required.',\n", + " 'title': 'StackSample: 10% of Stack Overflow Q&A',\n", + " 'topic': 'StackOverflow questions/answers',\n", + " 'urls': ['https://www.kaggle.com/stackoverflow/stacksample']}\n", + "{'_id': ObjectId('5d8514c46f91803ac7230c5c'),\n", + " 'description': 'Full text of all questions and answers from '\n", + " 'Stack Overflow that are tagged with the python '\n", + " 'tag. Useful for natural language processing '\n", + " 'and community analysis.',\n", + " 'license': 'All Stack Overflow user contributions are licensed '\n", + " 'under CC-BY-SA 3.0 with attribution required.',\n", + " 'title': 'Python Questions from Stack Overflow',\n", + " 'topic': 'Python questions on StackOverflow',\n", + " 'urls': ['https://www.kaggle.com/stackoverflow/pythonquestions']}\n", + "{'_id': ObjectId('5d8514c46f91803ac7230c5d'),\n", + " 'description': 'A dataset of Stack Overflow programming '\n", + " 'questions. For each question, it includes: '\n", + " 'Question ID, Creation date, Closed date, if '\n", + " 'applicable, Score, Owner user ID, Number of '\n", + " 'answers, Tags',\n", + " 'license': 'http://opendatacommons.org/licenses/dbcl/1.0/',\n", + " 'title': 'StackLite: Stack Overflow questions and tags',\n", + " 'topic': 'StackOverflow questions/answers',\n", + " 'urls': ['https://archive.org/details/stackexchange']}\n", + "{'_id': ObjectId('5d8514c46f91803ac7230c5e'),\n", + " 'description': 'Full text of questions and answers from Stack '\n", + " 'Overflow that are tagged with the r tag, '\n", + " 'useful for natural language processing and '\n", + " 'community analysis.',\n", + " 'license': 'All Stack Overflow user contributions are licensed '\n", + " 'under CC-BY-SA 3.0 with attribution required.',\n", + " 'title': 'R Questions from Stack Overflow',\n", + " 'topic': 'StackOverflow questions/answers',\n", + " 'urls': ['https://www.kaggle.com/stackoverflow/rquestions']}\n", + "{'_id': ObjectId('5d8514c46f91803ac7230c5f'),\n", + " 'description': 'Updated on a quarterly basis, this BigQuery '\n", + " 'dataset includes an archive of Stack Overflow '\n", + " 'content, including posts, votes, tags, and '\n", + " 'badges. This dataset is updated to mirror the '\n", + " 'Stack Overflow content on the Internet '\n", + " 'Archive, and is also available through the '\n", + " 'Stack Exchange Data Explorer.',\n", + " 'license': 'https://creativecommons.org/licenses/by-sa/3.0/',\n", + " 'title': 'Stack Overflow Data',\n", + " 'topic': 'StackOverflow BigQuery Dataset',\n", + " 'urls': ['https://www.kaggle.com/stackoverflow/stackoverflow']}\n", + "{'_id': ObjectId('5d8514c46f91803ac7230c60'),\n", + " 'description': 'Contains the top 50,000 answers (by vote '\n", + " 'count, per month) on StackOverflow and their '\n", + " 'questions, where the answers were created '\n", + " 'between April 1, 2019 and December 1, 2015 '\n", + " '(non-inclusive). The dataset excludes answers '\n", + " 'that contain a hyperlink (i.e. have an anchor '\n", + " 'tag in them).',\n", + " 'license': 'https://creativecommons.org/licenses/by-sa/3.0/',\n", + " 'title': 'StackOverflow Questions and Answers',\n", + " 'topic': 'StackOverflow questions/answers',\n", + " 'urls': ['https://www.kaggle.com/metrovirus/stackoverflow']}\n", + "{'_id': ObjectId('5d8514c46f91803ac7230c61'),\n", + " 'description': 'Full text of questions and answers from Cross '\n", + " 'Validated, the statistics and machine learning '\n", + " 'Q&A site from the Stack Exchange network.',\n", + " 'license': 'All Stack Exchange user contributions are licensed '\n", + " 'under CC-BY-SA 3.0 with attribution required.',\n", + " 'title': 'Questions from Cross Validated Stack Exchange',\n", + " 'topic': 'StackExchange questions',\n", + " 'urls': ['https://www.kaggle.com/stackoverflow/statsquestions']}\n", + "{'_id': ObjectId('5d8514c46f91803ac7230c62'),\n", + " 'description': '',\n", + " 'license': 'https://creativecommons.org/licenses/by-sa/3.0/',\n", + " 'title': 'Pandas QA on Stack Overflow',\n", + " 'topic': 'Pandas issues',\n", + " 'urls': ['https://www.kaggle.com/vivek42/pandas-qa-on-stack-overflow']}\n", + "{'_id': ObjectId('5d8514c46f91803ac7230c63'),\n", + " 'description': 'In this dataset, we explore StackOverflow '\n", + " 'questions and try to use unsupervised '\n", + " 'algorithms to extract tags, then train '\n", + " 'classifiers capable of suggesting tags to '\n", + " 'users who submit a question.',\n", + " 'license': 'Unknown',\n", + " 'title': 'Stack Overflow 2018 questions data set',\n", + " 'topic': 'StackOverflow',\n", + " 'urls': ['https://www.kaggle.com/moulhanout/stack-overflow-2018-questions-data-set']}\n", + "{'_id': ObjectId('5d8514c46f91803ac7230c64'),\n", + " 'description': 'Archive of Stack Overflow posts, votes, tags '\n", + " 'and badges',\n", + " 'license': 'Unknown',\n", + " 'title': 'Stack Overflow',\n", + " 'topic': 'StackOverflow questions/answers',\n", + " 'urls': ['https://console.cloud.google.com/gcr/images/tensorflow/GLOBAL/tpu-models?filter=solution-type%3Adataset&filter=category%3Asocial&subtask=details&subtaskValue=stack-exchange%2Fstack-overflow']}\n", + "{'_id': ObjectId('5d8514c46f91803ac7230c65'),\n", + " 'description': '',\n", + " 'license': 'Unknown',\n", + " 'title': 'StackExchangeData',\n", + " 'topic': 'StackExchange',\n", + " 'urls': ['https://www.kaggle.com/dklvch/stackexchangedata']}\n", + "{'_id': ObjectId('5d8514c46f91803ac7230c66'),\n", + " 'description': 'Q&A project',\n", + " 'license': 'Unknown',\n", + " 'title': 'curated_stackoverflow_dataset_for_Q_&_A',\n", + " 'topic': 'StackOverflow questions/answers',\n", + " 'urls': ['https://www.kaggle.com/pushpendra7/curated-stackoverflow-dataset-for-q-a/discussion?sortBy=hot&group=upvoted']}\n", + "{'_id': ObjectId('5d8514c46f91803ac7230c67'),\n", + " 'description': 'part-000xx.lzo - LZO archives with the data '\n", + " '(refer to \"Format\"). part-000xx.lzo.index - '\n", + " 'LZO index files so that the archives are '\n", + " 'splittable in Hadoop. stats.csv.gz - GZIP-ed '\n", + " 'CSV file with some repository statistics '\n", + " 'related to the commits. Format part-000xx - '\n", + " 'text, one line per repository, every line is '\n", + " 'JSON. Date and time format is mostly Go '\n", + " \"language's time.Time.String(), we recommend \"\n", + " 'using dateutil.parse() to parse it with '\n", + " 'Python. Commit message contains explicit \\r'\n", + " ' and symbols in order to be a single line. '\n", + " 'stats.csv has 4 columns: repository name, '\n", + " 'number of commits, number of contributors, '\n", + " 'average length of the commit messages.',\n", + " 'license': 'CC BY-NC',\n", + " 'title': '452M commits on GitHub',\n", + " 'topic': 'GitHub commits',\n", + " 'urls': ['https://zenodo.org/record/285467',\n", + " 'https://data.world/vmarkovtsev/452-m-commits-on-github']}\n", + "{'_id': ObjectId('5d8514c46f91803ac7230c68'),\n", + " 'description': 'The GitHub Java Corpus is a snapshot of all '\n", + " 'open-source Java code on GitHub in October '\n", + " '2012 that is contained in open-source projects '\n", + " 'that at the time had at least one fork. It '\n", + " 'contains code from 14,785 projects amounting '\n", + " 'to about 352 million lines of code. The '\n", + " 'dataset has been used to study coding practice '\n", + " 'in Java at a large scale.',\n", + " 'license': 'https://creativecommons.org/licenses/by/4.0',\n", + " 'title': 'GitHub Java Corpus',\n", + " 'topic': 'Java projects on GitHub',\n", + " 'urls': ['https://datashare.is.ed.ac.uk/handle/10283/2334',\n", + " 'https://search.datacite.org/works/10.7488/ds/1690']}\n", + "{'_id': ObjectId('5d8514c46f91803ac7230c69'),\n", + " 'description': '7375 typos developers made in source code '\n", + " 'identifiers, e.g. class names, function names, '\n", + " 'variable names, and fixed them on GitHub. See '\n", + " 'the Origin section about how they were mined.',\n", + " 'license': 'ODC-ODbL',\n", + " 'title': 'typos',\n", + " 'topic': 'Source code typos',\n", + " 'urls': ['https://data.world/source-d/typos']}\n", + "{'_id': ObjectId('5d8514c46f91803ac7230c6a'),\n", + " 'description': '',\n", + " 'license': 'https://creativecommons.org/licenses/by-sa/3.0/',\n", + " 'title': 'Stackoverflow question favourites',\n", + " 'topic': 'StackOverflow questions',\n", + " 'urls': ['https://www.kaggle.com/iancuv/stackoverflow-question-favourites']}\n", + "{'_id': ObjectId('5d8514c46f91803ac7230c6b'),\n", + " 'description': '',\n", + " 'license': 'Unknown',\n", + " 'title': 'stackoverflow',\n", + " 'topic': 'StackOverflow',\n", + " 'urls': ['https://www.kaggle.com/pankajkarki/stackoverflow']}\n", + "{'_id': ObjectId('5d8514c46f91803ac7230c6c'),\n", + " 'description': 'On the data team at Stack Overflow, we spend a '\n", + " 'lot of time and energy thinking about tech '\n", + " 'ecosystems and how technologies are related to '\n", + " 'each other. One way to get at this idea of '\n", + " 'relationships between technologies is tag '\n", + " 'correlations, how often technology tags at '\n", + " 'Stack Overflow appear together relative to how '\n", + " 'often they appear separately. One place we see '\n", + " 'developers using tags at Stack Overflow is on '\n", + " 'their Developer Stories, or professional '\n", + " 'profiles/CVs/resumes. If we are interested in '\n", + " 'how technologies are connected and how they '\n", + " \"are used together, developers' own \"\n", + " 'descriptions of their work and careers is a '\n", + " 'great place to get that.',\n", + " 'license': 'https://creativecommons.org/licenses/by-sa/3.0/',\n", + " 'title': 'Stack Overflow Tag Network',\n", + " 'topic': 'StackOverflow tags',\n", + " 'urls': ['https://www.kaggle.com/stackoverflow/stack-overflow-tag-network']}\n" + ] + } + ], + "source": [ + "import pprint\n", + "import pymongo, json\n", + "client = pymongo.MongoClient (host=\"da1.eecs.utk.edu\")\n", + "db = client ['fdac19mp2']\n", + "coll = db ['jball16']\n", + "pp = pprint.PrettyPrinter(indent=1,width=65)\n", + "for r in coll. find():\n", + " print(pp .pformat (r)) " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "20" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "coll.count_documents({})" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.7" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/MongoDB.ipynb b/MongoDB.ipynb deleted file mode 100644 index b817f97..0000000 --- a/MongoDB.ipynb +++ /dev/null @@ -1,102 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py:11: DeprecationWarning: insert is deprecated. Use insert_one or insert_many instead.\n", - " # This is added back by InteractiveShellApp.init_path()\n" - ] - }, - { - "data": { - "text/plain": [ - "ObjectId('5d7e6769e1eaff62f15de0d2')" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import pprint\n", - "import re\n", - "import pymongo, json\n", - "\n", - "pp = pprint.PrettyPrinter(indent=1,width=65)\n", - "\n", - "client = pymongo.MongoClient (host=\"da1.eecs.utk.edu\")\n", - "db = client ['fdac19mp2']\n", - "coll = db ['audris']\n", - "# for each dataset\n", - "coll.insert_one ( { 'topic':'git URLs', 'first dataset': 'largest projects', 'license': 'NA', 'description': 'The list of projects on github with the largest number of starts', 'urls': [ 'url1', 'url2' ] } )\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'_id': ObjectId('5d7e6769e1eaff62f15de0d2'),\n", - " 'description': 'The list of projects on github with the '\n", - " 'largest number of starts',\n", - " 'first dataset': 'largest projects',\n", - " 'license': 'NA',\n", - " 'topic': 'git URLs',\n", - " 'urls': ['url1', 'url2']}\n" - ] - } - ], - "source": [ - "import pprint\n", - "import pymongo, json\n", - "client = pymongo.MongoClient (host=\"da1.eecs.utk.edu\")\n", - "db = client ['fdac19mp2']\n", - "coll = db ['audris']\n", - "pp = pprint.PrettyPrinter(indent=1,width=65)\n", - "for r in coll. find():\n", - " print(pp .pformat (r)) " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.2" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/jball16.ipynb b/jball16.ipynb new file mode 100644 index 0000000..2eccdba --- /dev/null +++ b/jball16.ipynb @@ -0,0 +1,373 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pprint\n", + "import re\n", + "import pymongo, json\n", + "\n", + "pp = pprint.PrettyPrinter(indent=1,width=65)\n", + "\n", + "client = pymongo.MongoClient (host=\"da1.eecs.utk.edu\")\n", + "db = client ['fdac19mp2']\n", + "coll = db ['jball16']\n", + "#1\n", + "coll.insert_one ( { 'topic':'GitHub bugs', 'title': 'bugfiles', 'license': 'http://creativecommons.org/licenses/by/4.0/legalcode', 'description': 'This dataset contains bug reports, commit history, and API descriptions of six open source Java projects including Eclipse Platform UI, SWT, JDT, AspectJ, Birt, and Tomcat. This dataset was used to evaluate a learning to rank approach that recommends relevant files for bug reports.', 'urls': [ 'https://doi.org/10.5281/zenodo.268486' ] } )\n", + "#2\n", + "coll.insert_one ( { 'topic':'Github bugs', 'title': 'GitHub issue titles and descriptions for NLP analysis.', 'license': 'MIT License, Copyright (c) 2018 David Shinn, Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the \"Software\"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.', 'description': 'Over 8 million GitHub issue titles and descriptions from 2017. Prepared from instructions at How To Create Data Products That Are Magical Using Sequence-to-Sequence Models.', 'urls': [ 'https://www.githubarchive.org/' ] } )\n", + "#3\n", + "coll.insert_one ( { 'topic':'StackOverflow questions/answers', 'title': 'StackSample: 10% of Stack Overflow Q&A', 'license': 'All Stack Overflow user contributions are licensed under CC-BY-SA 3.0 with attribution required.', 'description': 'Dataset with the text of 10% of questions and answers from the Stack Overflow programming Q&A website. This is organized as three tables: Questions contains the title, body, creation date, closed date (if applicable), score, and owner ID for all non-deleted Stack Overflow questions whose Id is a multiple of 10. Answers contains the body, creation date, score, and owner ID for each of the answers to these questions. The ParentId column links back to the Questions table. Tags contains the tags on each of these questions', 'urls': [ 'https://www.kaggle.com/stackoverflow/stacksample' ] } )\n", + "#4\n", + "coll.insert_one ( { 'topic':'Python questions on StackOverflow', 'title': 'Python Questions from Stack Overflow', 'license': 'All Stack Overflow user contributions are licensed under CC-BY-SA 3.0 with attribution required.', 'description': 'Full text of all questions and answers from Stack Overflow that are tagged with the python tag. Useful for natural language processing and community analysis.', 'urls': [ 'https://www.kaggle.com/stackoverflow/pythonquestions' ] } )\n", + "#5\n", + "coll.insert_one ( { 'topic':'StackOverflow questions/answers', 'title': 'StackLite: Stack Overflow questions and tags', 'license': 'http://opendatacommons.org/licenses/dbcl/1.0/', 'description': 'A dataset of Stack Overflow programming questions. For each question, it includes: Question ID, Creation date, Closed date, if applicable, Score, Owner user ID, Number of answers, Tags', 'urls': [ 'https://archive.org/details/stackexchange' ] } )\n", + "#6\n", + "coll.insert_one ( { 'topic':'StackOverflow questions/answers', 'title': 'R Questions from Stack Overflow', 'license': 'All Stack Overflow user contributions are licensed under CC-BY-SA 3.0 with attribution required.', 'description': 'Full text of questions and answers from Stack Overflow that are tagged with the r tag, useful for natural language processing and community analysis.', 'urls': [ 'https://www.kaggle.com/stackoverflow/rquestions' ] } )\n", + "#7\n", + "coll.insert_one ( { 'topic':'StackOverflow BigQuery Dataset', 'title': 'Stack Overflow Data', 'license': 'https://creativecommons.org/licenses/by-sa/3.0/', 'description': 'Updated on a quarterly basis, this BigQuery dataset includes an archive of Stack Overflow content, including posts, votes, tags, and badges. This dataset is updated to mirror the Stack Overflow content on the Internet Archive, and is also available through the Stack Exchange Data Explorer.', 'urls': [ 'https://www.kaggle.com/stackoverflow/stackoverflow' ] } )\n", + "#8\n", + "coll.insert_one ( { 'topic':'StackOverflow questions/answers', 'title': 'StackOverflow Questions and Answers', 'license': 'https://creativecommons.org/licenses/by-sa/3.0/', 'description': 'Contains the top 50,000 answers (by vote count, per month) on StackOverflow and their questions, where the answers were created between April 1, 2019 and December 1, 2015 (non-inclusive). The dataset excludes answers that contain a hyperlink (i.e. have an anchor tag in them).', 'urls': [ 'https://www.kaggle.com/metrovirus/stackoverflow' ] } )\n", + "#9\n", + "coll.insert_one ( { 'topic':'StackExchange questions', 'title': 'Questions from Cross Validated Stack Exchange', 'license': 'All Stack Exchange user contributions are licensed under CC-BY-SA 3.0 with attribution required.', 'description': 'Full text of questions and answers from Cross Validated, the statistics and machine learning Q&A site from the Stack Exchange network.', 'urls': [ 'https://www.kaggle.com/stackoverflow/statsquestions' ] } )\n", + "#10\n", + "coll.insert_one ( { 'topic':'Pandas issues', 'title': 'Pandas QA on Stack Overflow', 'license': 'https://creativecommons.org/licenses/by-sa/3.0/', 'description': '', 'urls': [ 'https://www.kaggle.com/vivek42/pandas-qa-on-stack-overflow' ] } )\n", + "#11\n", + "coll.insert_one ( { 'topic':'StackOverflow', 'title': 'Stack Overflow 2018 questions data set', 'license': 'Unknown', 'description': 'In this dataset, we explore StackOverflow questions and try to use unsupervised algorithms to extract tags, then train classifiers capable of suggesting tags to users who submit a question.', 'urls': [ 'https://www.kaggle.com/moulhanout/stack-overflow-2018-questions-data-set' ] } )\n", + "#12\n", + "coll.insert_one ( { 'topic':'StackOverflow questions/answers', 'title': 'Stack Overflow', 'license': 'Unknown', 'description': 'Archive of Stack Overflow posts, votes, tags and badges', 'urls': [ 'https://console.cloud.google.com/gcr/images/tensorflow/GLOBAL/tpu-models?filter=solution-type%3Adataset&filter=category%3Asocial&subtask=details&subtaskValue=stack-exchange%2Fstack-overflow' ] } )\n", + "#13\n", + "coll.insert_one ( { 'topic':'StackExchange', 'title': 'StackExchangeData', 'license': 'Unknown', 'description': '', 'urls': [ 'https://www.kaggle.com/dklvch/stackexchangedata' ] } )\n", + "#14\n", + "coll.insert_one ( { 'topic':'StackOverflow questions/answers', 'title': 'curated_stackoverflow_dataset_for_Q_&_A', 'license': 'Unknown', 'description': 'Q&A project', 'urls': [ 'https://www.kaggle.com/pushpendra7/curated-stackoverflow-dataset-for-q-a/discussion?sortBy=hot&group=upvoted' ] } )\n", + "#15\n", + "coll.insert_one ( { 'topic':'GitHub commits', 'title': '452M commits on GitHub', 'license': 'CC BY-NC', 'description': 'part-000xx.lzo - LZO archives with the data (refer to \"Format\"). part-000xx.lzo.index - LZO index files so that the archives are splittable in Hadoop. stats.csv.gz - GZIP-ed CSV file with some repository statistics related to the commits. Format part-000xx - text, one line per repository, every line is JSON. Date and time format is mostly Go language\\'s time.Time.String(), we recommend using dateutil.parse() to parse it with Python. Commit message contains explicit \\r and symbols in order to be a single line. stats.csv has 4 columns: repository name, number of commits, number of contributors, average length of the commit messages.', 'urls': [ 'https://zenodo.org/record/285467', 'https://data.world/vmarkovtsev/452-m-commits-on-github' ] } )\n", + "#16\n", + "coll.insert_one ( { 'topic':'Java projects on GitHub', 'title': 'GitHub Java Corpus', 'license': 'https://creativecommons.org/licenses/by/4.0', 'description': 'The GitHub Java Corpus is a snapshot of all open-source Java code on GitHub in October 2012 that is contained in open-source projects that at the time had at least one fork. It contains code from 14,785 projects amounting to about 352 million lines of code. The dataset has been used to study coding practice in Java at a large scale.', 'urls': [ 'https://datashare.is.ed.ac.uk/handle/10283/2334', 'https://search.datacite.org/works/10.7488/ds/1690' ] } )\n", + "#17\n", + "coll.insert_one ( { 'topic':'Source code typos', 'title': 'typos', 'license': 'ODC-ODbL', 'description': '7375 typos developers made in source code identifiers, e.g. class names, function names, variable names, and fixed them on GitHub. See the Origin section about how they were mined.', 'urls': [ 'https://data.world/source-d/typos' ] } )\n", + "#18\n", + "coll.insert_one ( { 'topic':'StackOverflow questions', 'title': 'Stackoverflow question favourites', 'license': 'https://creativecommons.org/licenses/by-sa/3.0/', 'description': '', 'urls': [ 'https://www.kaggle.com/iancuv/stackoverflow-question-favourites' ] } )\n", + "#19\n", + "coll.insert_one ( { 'topic':'StackOverflow', 'title': 'stackoverflow', 'license': 'Unknown', 'description': '', 'urls': [ 'https://www.kaggle.com/pankajkarki/stackoverflow' ] } )\n", + "#20\n", + "coll.insert_one ( { 'topic':'StackOverflow tags', 'title': 'Stack Overflow Tag Network', 'license': 'https://creativecommons.org/licenses/by-sa/3.0/', 'description': 'On the data team at Stack Overflow, we spend a lot of time and energy thinking about tech ecosystems and how technologies are related to each other. One way to get at this idea of relationships between technologies is tag correlations, how often technology tags at Stack Overflow appear together relative to how often they appear separately. One place we see developers using tags at Stack Overflow is on their Developer Stories, or professional profiles/CVs/resumes. If we are interested in how technologies are connected and how they are used together, developers\\' own descriptions of their work and careers is a great place to get that.', 'urls': [ 'https://www.kaggle.com/stackoverflow/stack-overflow-tag-network' ] } )" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'_id': ObjectId('5d8514c46f91803ac7230c59'),\n", + " 'description': 'This dataset contains bug reports, commit '\n", + " 'history, and API descriptions of six open '\n", + " 'source Java projects including Eclipse '\n", + " 'Platform UI, SWT, JDT, AspectJ, Birt, and '\n", + " 'Tomcat. This dataset was used to evaluate a '\n", + " 'learning to rank approach that recommends '\n", + " 'relevant files for bug reports.',\n", + " 'license': 'http://creativecommons.org/licenses/by/4.0/legalcode',\n", + " 'title': 'bugfiles',\n", + " 'topic': 'GitHub bugs',\n", + " 'urls': ['https://doi.org/10.5281/zenodo.268486']}\n", + "{'_id': ObjectId('5d8514c46f91803ac7230c5a'),\n", + " 'description': 'Over 8 million GitHub issue titles and '\n", + " 'descriptions from 2017. Prepared from '\n", + " 'instructions at How To Create Data Products '\n", + " 'That Are Magical Using Sequence-to-Sequence '\n", + " 'Models.',\n", + " 'license': 'MIT License, Copyright (c) 2018 David Shinn, '\n", + " 'Permission is hereby granted, free of charge, to '\n", + " 'any person obtaining a copy of this software and '\n", + " 'associated documentation files (the \"Software\"), '\n", + " 'to deal in the Software without restriction, '\n", + " 'including without limitation the rights to use, '\n", + " 'copy, modify, merge, publish, distribute, '\n", + " 'sublicense, and/or sell copies of the Software, '\n", + " 'and to permit persons to whom the Software is '\n", + " 'furnished to do so, subject to the following '\n", + " 'conditions: The above copyright notice and this '\n", + " 'permission notice shall be included in all copies '\n", + " 'or substantial portions of the Software. THE '\n", + " 'SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF '\n", + " 'ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT '\n", + " 'LIMITED TO THE WARRANTIES OF MERCHANTABILITY, '\n", + " 'FITNESS FOR A PARTICULAR PURPOSE AND '\n", + " 'NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR '\n", + " 'COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES '\n", + " 'OR OTHER LIABILITY, WHETHER IN AN ACTION OF '\n", + " 'CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF '\n", + " 'OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR '\n", + " 'OTHER DEALINGS IN THE SOFTWARE.',\n", + " 'title': 'GitHub issue titles and descriptions for NLP '\n", + " 'analysis.',\n", + " 'topic': 'Github bugs',\n", + " 'urls': ['https://www.githubarchive.org/']}\n", + "{'_id': ObjectId('5d8514c46f91803ac7230c5b'),\n", + " 'description': 'Dataset with the text of 10% of questions and '\n", + " 'answers from the Stack Overflow programming '\n", + " 'Q&A website. This is organized as three '\n", + " 'tables: Questions contains the title, body, '\n", + " 'creation date, closed date (if applicable), '\n", + " 'score, and owner ID for all non-deleted Stack '\n", + " 'Overflow questions whose Id is a multiple of '\n", + " '10. Answers contains the body, creation date, '\n", + " 'score, and owner ID for each of the answers to '\n", + " 'these questions. The ParentId column links '\n", + " 'back to the Questions table. Tags contains the '\n", + " 'tags on each of these questions',\n", + " 'license': 'All Stack Overflow user contributions are licensed '\n", + " 'under CC-BY-SA 3.0 with attribution required.',\n", + " 'title': 'StackSample: 10% of Stack Overflow Q&A',\n", + " 'topic': 'StackOverflow questions/answers',\n", + " 'urls': ['https://www.kaggle.com/stackoverflow/stacksample']}\n", + "{'_id': ObjectId('5d8514c46f91803ac7230c5c'),\n", + " 'description': 'Full text of all questions and answers from '\n", + " 'Stack Overflow that are tagged with the python '\n", + " 'tag. Useful for natural language processing '\n", + " 'and community analysis.',\n", + " 'license': 'All Stack Overflow user contributions are licensed '\n", + " 'under CC-BY-SA 3.0 with attribution required.',\n", + " 'title': 'Python Questions from Stack Overflow',\n", + " 'topic': 'Python questions on StackOverflow',\n", + " 'urls': ['https://www.kaggle.com/stackoverflow/pythonquestions']}\n", + "{'_id': ObjectId('5d8514c46f91803ac7230c5d'),\n", + " 'description': 'A dataset of Stack Overflow programming '\n", + " 'questions. For each question, it includes: '\n", + " 'Question ID, Creation date, Closed date, if '\n", + " 'applicable, Score, Owner user ID, Number of '\n", + " 'answers, Tags',\n", + " 'license': 'http://opendatacommons.org/licenses/dbcl/1.0/',\n", + " 'title': 'StackLite: Stack Overflow questions and tags',\n", + " 'topic': 'StackOverflow questions/answers',\n", + " 'urls': ['https://archive.org/details/stackexchange']}\n", + "{'_id': ObjectId('5d8514c46f91803ac7230c5e'),\n", + " 'description': 'Full text of questions and answers from Stack '\n", + " 'Overflow that are tagged with the r tag, '\n", + " 'useful for natural language processing and '\n", + " 'community analysis.',\n", + " 'license': 'All Stack Overflow user contributions are licensed '\n", + " 'under CC-BY-SA 3.0 with attribution required.',\n", + " 'title': 'R Questions from Stack Overflow',\n", + " 'topic': 'StackOverflow questions/answers',\n", + " 'urls': ['https://www.kaggle.com/stackoverflow/rquestions']}\n", + "{'_id': ObjectId('5d8514c46f91803ac7230c5f'),\n", + " 'description': 'Updated on a quarterly basis, this BigQuery '\n", + " 'dataset includes an archive of Stack Overflow '\n", + " 'content, including posts, votes, tags, and '\n", + " 'badges. This dataset is updated to mirror the '\n", + " 'Stack Overflow content on the Internet '\n", + " 'Archive, and is also available through the '\n", + " 'Stack Exchange Data Explorer.',\n", + " 'license': 'https://creativecommons.org/licenses/by-sa/3.0/',\n", + " 'title': 'Stack Overflow Data',\n", + " 'topic': 'StackOverflow BigQuery Dataset',\n", + " 'urls': ['https://www.kaggle.com/stackoverflow/stackoverflow']}\n", + "{'_id': ObjectId('5d8514c46f91803ac7230c60'),\n", + " 'description': 'Contains the top 50,000 answers (by vote '\n", + " 'count, per month) on StackOverflow and their '\n", + " 'questions, where the answers were created '\n", + " 'between April 1, 2019 and December 1, 2015 '\n", + " '(non-inclusive). The dataset excludes answers '\n", + " 'that contain a hyperlink (i.e. have an anchor '\n", + " 'tag in them).',\n", + " 'license': 'https://creativecommons.org/licenses/by-sa/3.0/',\n", + " 'title': 'StackOverflow Questions and Answers',\n", + " 'topic': 'StackOverflow questions/answers',\n", + " 'urls': ['https://www.kaggle.com/metrovirus/stackoverflow']}\n", + "{'_id': ObjectId('5d8514c46f91803ac7230c61'),\n", + " 'description': 'Full text of questions and answers from Cross '\n", + " 'Validated, the statistics and machine learning '\n", + " 'Q&A site from the Stack Exchange network.',\n", + " 'license': 'All Stack Exchange user contributions are licensed '\n", + " 'under CC-BY-SA 3.0 with attribution required.',\n", + " 'title': 'Questions from Cross Validated Stack Exchange',\n", + " 'topic': 'StackExchange questions',\n", + " 'urls': ['https://www.kaggle.com/stackoverflow/statsquestions']}\n", + "{'_id': ObjectId('5d8514c46f91803ac7230c62'),\n", + " 'description': '',\n", + " 'license': 'https://creativecommons.org/licenses/by-sa/3.0/',\n", + " 'title': 'Pandas QA on Stack Overflow',\n", + " 'topic': 'Pandas issues',\n", + " 'urls': ['https://www.kaggle.com/vivek42/pandas-qa-on-stack-overflow']}\n", + "{'_id': ObjectId('5d8514c46f91803ac7230c63'),\n", + " 'description': 'In this dataset, we explore StackOverflow '\n", + " 'questions and try to use unsupervised '\n", + " 'algorithms to extract tags, then train '\n", + " 'classifiers capable of suggesting tags to '\n", + " 'users who submit a question.',\n", + " 'license': 'Unknown',\n", + " 'title': 'Stack Overflow 2018 questions data set',\n", + " 'topic': 'StackOverflow',\n", + " 'urls': ['https://www.kaggle.com/moulhanout/stack-overflow-2018-questions-data-set']}\n", + "{'_id': ObjectId('5d8514c46f91803ac7230c64'),\n", + " 'description': 'Archive of Stack Overflow posts, votes, tags '\n", + " 'and badges',\n", + " 'license': 'Unknown',\n", + " 'title': 'Stack Overflow',\n", + " 'topic': 'StackOverflow questions/answers',\n", + " 'urls': ['https://console.cloud.google.com/gcr/images/tensorflow/GLOBAL/tpu-models?filter=solution-type%3Adataset&filter=category%3Asocial&subtask=details&subtaskValue=stack-exchange%2Fstack-overflow']}\n", + "{'_id': ObjectId('5d8514c46f91803ac7230c65'),\n", + " 'description': '',\n", + " 'license': 'Unknown',\n", + " 'title': 'StackExchangeData',\n", + " 'topic': 'StackExchange',\n", + " 'urls': ['https://www.kaggle.com/dklvch/stackexchangedata']}\n", + "{'_id': ObjectId('5d8514c46f91803ac7230c66'),\n", + " 'description': 'Q&A project',\n", + " 'license': 'Unknown',\n", + " 'title': 'curated_stackoverflow_dataset_for_Q_&_A',\n", + " 'topic': 'StackOverflow questions/answers',\n", + " 'urls': ['https://www.kaggle.com/pushpendra7/curated-stackoverflow-dataset-for-q-a/discussion?sortBy=hot&group=upvoted']}\n", + "{'_id': ObjectId('5d8514c46f91803ac7230c67'),\n", + " 'description': 'part-000xx.lzo - LZO archives with the data '\n", + " '(refer to \"Format\"). part-000xx.lzo.index - '\n", + " 'LZO index files so that the archives are '\n", + " 'splittable in Hadoop. stats.csv.gz - GZIP-ed '\n", + " 'CSV file with some repository statistics '\n", + " 'related to the commits. Format part-000xx - '\n", + " 'text, one line per repository, every line is '\n", + " 'JSON. Date and time format is mostly Go '\n", + " \"language's time.Time.String(), we recommend \"\n", + " 'using dateutil.parse() to parse it with '\n", + " 'Python. Commit message contains explicit \\r'\n", + " ' and symbols in order to be a single line. '\n", + " 'stats.csv has 4 columns: repository name, '\n", + " 'number of commits, number of contributors, '\n", + " 'average length of the commit messages.',\n", + " 'license': 'CC BY-NC',\n", + " 'title': '452M commits on GitHub',\n", + " 'topic': 'GitHub commits',\n", + " 'urls': ['https://zenodo.org/record/285467',\n", + " 'https://data.world/vmarkovtsev/452-m-commits-on-github']}\n", + "{'_id': ObjectId('5d8514c46f91803ac7230c68'),\n", + " 'description': 'The GitHub Java Corpus is a snapshot of all '\n", + " 'open-source Java code on GitHub in October '\n", + " '2012 that is contained in open-source projects '\n", + " 'that at the time had at least one fork. It '\n", + " 'contains code from 14,785 projects amounting '\n", + " 'to about 352 million lines of code. The '\n", + " 'dataset has been used to study coding practice '\n", + " 'in Java at a large scale.',\n", + " 'license': 'https://creativecommons.org/licenses/by/4.0',\n", + " 'title': 'GitHub Java Corpus',\n", + " 'topic': 'Java projects on GitHub',\n", + " 'urls': ['https://datashare.is.ed.ac.uk/handle/10283/2334',\n", + " 'https://search.datacite.org/works/10.7488/ds/1690']}\n", + "{'_id': ObjectId('5d8514c46f91803ac7230c69'),\n", + " 'description': '7375 typos developers made in source code '\n", + " 'identifiers, e.g. class names, function names, '\n", + " 'variable names, and fixed them on GitHub. See '\n", + " 'the Origin section about how they were mined.',\n", + " 'license': 'ODC-ODbL',\n", + " 'title': 'typos',\n", + " 'topic': 'Source code typos',\n", + " 'urls': ['https://data.world/source-d/typos']}\n", + "{'_id': ObjectId('5d8514c46f91803ac7230c6a'),\n", + " 'description': '',\n", + " 'license': 'https://creativecommons.org/licenses/by-sa/3.0/',\n", + " 'title': 'Stackoverflow question favourites',\n", + " 'topic': 'StackOverflow questions',\n", + " 'urls': ['https://www.kaggle.com/iancuv/stackoverflow-question-favourites']}\n", + "{'_id': ObjectId('5d8514c46f91803ac7230c6b'),\n", + " 'description': '',\n", + " 'license': 'Unknown',\n", + " 'title': 'stackoverflow',\n", + " 'topic': 'StackOverflow',\n", + " 'urls': ['https://www.kaggle.com/pankajkarki/stackoverflow']}\n", + "{'_id': ObjectId('5d8514c46f91803ac7230c6c'),\n", + " 'description': 'On the data team at Stack Overflow, we spend a '\n", + " 'lot of time and energy thinking about tech '\n", + " 'ecosystems and how technologies are related to '\n", + " 'each other. One way to get at this idea of '\n", + " 'relationships between technologies is tag '\n", + " 'correlations, how often technology tags at '\n", + " 'Stack Overflow appear together relative to how '\n", + " 'often they appear separately. One place we see '\n", + " 'developers using tags at Stack Overflow is on '\n", + " 'their Developer Stories, or professional '\n", + " 'profiles/CVs/resumes. If we are interested in '\n", + " 'how technologies are connected and how they '\n", + " \"are used together, developers' own \"\n", + " 'descriptions of their work and careers is a '\n", + " 'great place to get that.',\n", + " 'license': 'https://creativecommons.org/licenses/by-sa/3.0/',\n", + " 'title': 'Stack Overflow Tag Network',\n", + " 'topic': 'StackOverflow tags',\n", + " 'urls': ['https://www.kaggle.com/stackoverflow/stack-overflow-tag-network']}\n" + ] + } + ], + "source": [ + "import pprint\n", + "import pymongo, json\n", + "client = pymongo.MongoClient (host=\"da1.eecs.utk.edu\")\n", + "db = client ['fdac19mp2']\n", + "coll = db ['jball16']\n", + "pp = pprint.PrettyPrinter(indent=1,width=65)\n", + "for r in coll. find():\n", + " print(pp .pformat (r)) " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "20" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "coll.count_documents({})" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.7" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +}