diff --git a/NVDDataFetch-V1.ipynb b/NVDDataFetch-V1.ipynb index 0bcdb34..92e9177 100644 --- a/NVDDataFetch-V1.ipynb +++ b/NVDDataFetch-V1.ipynb @@ -396,6 +396,395 @@ "all_items = pd.read_csv('NVD-Vulnerability-Volumes.csv',index_col=['Publication'],parse_dates=['Publication'],infer_datetime_format=True)\n", "all_items = all_items.sort_index()" ] + }, + { + "cell_type": "markdown", + "id": "12c2b711-94af-4a73-aa8d-775214ee4b1e", + "metadata": {}, + "source": [ + "# NVD API" + ] + }, + { + "cell_type": "markdown", + "id": "9fdf6c73-f5af-49bc-8f91-e10877c5b1b4", + "metadata": {}, + "source": [ + "As an alternative to the above you can use the NVD API, which is the method NVD requests you use and also the one that will be supported in the future. [Info on deprecation timeline here](https://nvd.nist.gov/general/news/api-20-announcements) and [info on the use of the API here](https://nvd.nist.gov/developers/vulnerabilities). API keys are free, and more information about getting them is available at the URLs mentioned. API keys are not necessary, but will increase the rate you are able to pull the data." + ] + }, + { + "cell_type": "code", + "execution_count": 131, + "id": "8b07cace-dd9f-45f4-8916-c0dc66641e71", + "metadata": {}, + "outputs": [], + "source": [ + "# Create the whole output tree in one call. exist_ok=True makes the cell safe to\n", + "# re-run and also repairs the case where ./CVE-NVD-API exists but its JSON\n", + "# subdirectory does not (previously that subdirectory was never created).\n", + "file_exists = os.path.exists('./CVE-NVD-API')\n", + "os.makedirs('./CVE-NVD-API/JSON', exist_ok=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 132, + "id": "77208840-9fda-48a1-bc64-a2ed7d8dd83d", + "metadata": {}, + "outputs": [], + "source": [ + "#API_key = \"\"\n", + "headers = {'User-Agent': 'I am scraping this data for research purposes. Please do not block.
Contact me at '}\n", + "#headers['apiKey'] = API_key # With an API key\n", + "base_url = \"https://services.nvd.nist.gov/rest/json/cves/2.0/\"\n", + "sess = requests.Session()\n", + "sess.headers.update(headers)" + ] + }, + { + "cell_type": "code", + "execution_count": 133, + "id": "6716b6d2-520b-421f-a167-22e2be94073a", + "metadata": {}, + "outputs": [], + "source": [ + "# Probe request whose response is read for the overall result count.\n", + "# The timeout keeps a stalled connection from hanging the notebook indefinitely.\n", + "info_resp = sess.get(base_url, params={\"resultsPerPage\": 1000, \"startIndex\": 0}, timeout=60)" + ] + }, + { + "cell_type": "code", + "execution_count": 134, + "id": "79bb3d66-75da-40bd-b88c-593067dcffa6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total results 226582\n" + ] + } + ], + "source": [ + "# Reset first so a failed request cannot silently reuse a value left in the\n", + "# kernel by an earlier run of this cell.\n", + "total_results = None\n", + "if info_resp.status_code == 200:\n", + "    d = json.loads(info_resp.text)\n", + "    total_results = d.get('totalResults', None)\n", + "    if total_results is None:\n", + "        print(\"Problem fetching total results\")\n", + "    else:\n", + "        print(\"Total results \" + str(total_results))\n", + "else:\n", + "    print(\"API failed to respond. Status code: \" + str(info_resp.status_code))" + ] + }, + { + "cell_type": "markdown", + "id": "b7b99e56-86b3-414d-8c39-8f65d6971689", + "metadata": {}, + "source": [ + "The [API best practices](https://nvd.nist.gov/developers/start-here) indicate that there is a limit of 5 requests in a rolling 30 second window for requests without an API key and 50 in a rolling 30 second window for requests made with an API key. We'll then want to sleep between each request a little bit more than 6 seconds (or .6 if you have an API key), just to ensure we are not overloading them. At this rate it will take more than 35 minutes.
If you have an API key, it'll be a little more than 3.5 minutes." + ] + }, + { + "cell_type": "code", + "execution_count": 135, + "id": "b44db69a-cb70-40b7-90f0-a4530cbdd995", + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "import random" + ] + }, + { + "cell_type": "code", + "execution_count": 136, + "id": "731461be-3dd7-4b98-b790-e20b302854ee", + "metadata": {}, + "outputs": [], + "source": [ + "# Paging and pacing settings read by the download loop.\n", + "rpp = 1000  # results per page; original note: the limit is technically 5000, but requests may time out\n", + "time_delay = 6.1  # base pause in seconds between requests\n", + "params = dict(resultsPerPage=rpp)\n", + "start_is = range(0, total_results, rpp)  # start index of every page" + ] + }, + { + "cell_type": "code", + "execution_count": 140, + "id": "8390964c-02b7-4d64-a2fd-2e893f17780b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Already scraped index 0\n", + "Already scraped index 1000\n", + "Already scraped index 2000\n", + "Already scraped index 3000\n", + "Already scraped index 4000\n", + "Already scraped index 5000\n", + "Already scraped index 6000\n", + "Already scraped index 7000\n", + "Already scraped index 8000\n", + "Already scraped index 9000\n", + "Already scraped index 10000\n", + "Already scraped index 11000\n", + "Already scraped index 12000\n", + "Already scraped index 13000\n", + "Already scraped index 14000\n", + "Already scraped index 15000\n", + "Already scraped index 16000\n", + "Already scraped index 17000\n", + "Already scraped index 18000\n", + "Already scraped index 19000\n", + "Already scraped index 20000\n", + "Already scraped index 21000\n", + "Already scraped index 22000\n", + "Already scraped index 23000\n", + "Already scraped index 24000\n", + "Already scraped index 25000\n", + "Already scraped index 26000\n", + "Already scraped index 27000\n", + "Already scraped index 28000\n", + "Already scraped index 29000\n", + "Already scraped index 30000\n", + "Already scraped index 31000\n", + "Already scraped index 32000\n", + "Already scraped index 33000\n", +
"Already scraped index 34000\n", + "Already scraped index 35000\n", + "Already scraped index 36000\n", + "Already scraped index 37000\n", + "Already scraped index 38000\n", + "Already scraped index 39000\n", + "Already scraped index 40000\n", + "Already scraped index 41000\n", + "Already scraped index 42000\n", + "Already scraped index 43000\n", + "Already scraped index 44000\n", + "Already scraped index 45000\n", + "Already scraped index 46000\n", + "Already scraped index 47000\n", + "Already scraped index 48000\n", + "Already scraped index 49000\n", + "Already scraped index 50000\n", + "Already scraped index 51000\n", + "Already scraped index 52000\n", + "Already scraped index 53000\n", + "Already scraped index 54000\n", + "Already scraped index 55000\n", + "Already scraped index 56000\n", + "Already scraped index 57000\n", + "Already scraped index 58000\n", + "Already scraped index 59000\n", + "Already scraped index 60000\n", + "Already scraped index 61000\n", + "Already scraped index 62000\n", + "Already scraped index 63000\n", + "Already scraped index 64000\n", + "Already scraped index 65000\n", + "Already scraped index 66000\n", + "Already scraped index 67000\n", + "Already scraped index 68000\n", + "Already scraped index 69000\n", + "Already scraped index 70000\n", + "Already scraped index 71000\n", + "Already scraped index 72000\n", + "Already scraped index 73000\n", + "Already scraped index 74000\n", + "Already scraped index 75000\n", + "Already scraped index 76000\n", + "Already scraped index 77000\n", + "Already scraped index 78000\n", + "Already scraped index 79000\n", + "Already scraped index 80000\n", + "Already scraped index 81000\n", + "Already scraped index 82000\n", + "Already scraped index 83000\n", + "Already scraped index 84000\n", + "Already scraped index 85000\n", + "Already scraped index 86000\n", + "Already scraped index 87000\n", + "Already scraped index 88000\n", + "Already scraped index 89000\n", + "Already scraped index 90000\n", + 
"Already scraped index 91000\n", + "Already scraped index 92000\n", + "Already scraped index 93000\n", + "Already scraped index 94000\n", + "Already scraped index 95000\n", + "Already scraped index 96000\n", + "Already scraped index 97000\n", + "Already scraped index 98000\n", + "Already scraped index 99000\n", + "Already scraped index 100000\n", + "Already scraped index 101000\n", + "Already scraped index 102000\n", + "Already scraped index 103000\n", + "Already scraped index 104000\n", + "Already scraped index 105000\n", + "Already scraped index 106000\n", + "Already scraped index 107000\n", + "Already scraped index 108000\n", + "Already scraped index 109000\n", + "Already scraped index 110000\n", + "Already scraped index 111000\n", + "Already scraped index 112000\n", + "Already scraped index 113000\n", + "Already scraped index 114000\n", + "Already scraped index 115000\n", + "Already scraped index 116000\n", + "Already scraped index 117000\n", + "Already scraped index 118000\n", + "Already scraped index 119000\n", + "Already scraped index 120000\n", + "Already scraped index 121000\n", + "Already scraped index 122000\n", + "Already scraped index 123000\n", + "Already scraped index 124000\n", + "Already scraped index 125000\n", + "Already scraped index 126000\n", + "Already scraped index 127000\n", + "Already scraped index 128000\n", + "Already scraped index 129000\n", + "Already scraped index 130000\n", + "Already scraped index 131000\n", + "Already scraped index 132000\n", + "Already scraped index 133000\n", + "Already scraped index 134000\n", + "Already scraped index 135000\n", + "Already scraped index 136000\n", + "Already scraped index 137000\n", + "Already scraped index 138000\n", + "Already scraped index 139000\n", + "Already scraped index 140000\n", + "Already scraped index 141000\n", + "Already scraped index 142000\n", + "Already scraped index 143000\n", + "Already scraped index 144000\n", + "Already scraped index 145000\n", + "Already scraped index 
146000\n", + "Already scraped index 147000\n", + "Already scraped index 148000\n", + "Already scraped index 149000\n", + "Already scraped index 150000\n", + "Already scraped index 151000\n", + "Already scraped index 152000\n", + "Already scraped index 153000\n", + "Already scraped index 154000\n", + "Already scraped index 155000\n", + "Already scraped index 156000\n", + "Already scraped index 157000\n", + "Already scraped index 158000\n", + "Already scraped index 159000\n", + "Already scraped index 160000\n", + "Already scraped index 161000\n", + "Already scraped index 162000\n", + "Already scraped index 163000\n", + "Already scraped index 164000\n", + "Already scraped index 165000\n", + "Already scraped index 166000\n", + "Already scraped index 167000\n", + "Already scraped index 168000\n", + "Already scraped index 169000\n", + "Already scraped index 170000\n", + "Already scraped index 171000\n", + "Already scraped index 172000\n", + "Already scraped index 173000\n", + "Already scraped index 174000\n", + "Already scraped index 175000\n", + "Already scraped index 176000\n", + "Already scraped index 177000\n", + "Already scraped index 178000\n", + "Already scraped index 179000\n", + "Already scraped index 180000\n", + "Already scraped index 181000\n", + "Already scraped index 182000\n", + "Already scraped index 183000\n", + "Already scraped index 184000\n", + "Already scraped index 185000\n", + "Already scraped index 186000\n", + "Already scraped index 187000\n", + "Already scraped index 188000\n", + "Already scraped index 189000\n", + "Already scraped index 190000\n", + "Already scraped index 191000\n", + "Already scraped index 192000\n", + "Already scraped index 193000\n", + "Already scraped index 194000\n", + "Already scraped index 195000\n", + "Already scraped index 196000\n", + "Already scraped index 197000\n", + "Already scraped index 198000\n", + "Already scraped index 199000\n", + "Already scraped index 200000\n", + "Already scraped index 201000\n", +
"Already scraped index 202000\n", + "Already scraped index 203000\n", + "Already scraped index 204000\n", + "Already scraped index 205000\n", + "Already scraped index 206000\n", + "Already scraped index 207000\n", + "Already scraped index 208000\n", + "Already scraped index 209000\n", + "Already scraped index 210000\n", + "Already scraped index 211000\n", + "Already scraped index 212000\n", + "Already scraped index 213000\n", + "Already scraped index 214000\n", + "Already scraped index 215000\n", + "Already scraped index 216000\n", + "Already scraped index 217000\n", + "Already scraped index 218000\n", + "Already scraped index 219000\n", + "Already scraped index 220000\n", + "Already scraped index 221000\n", + "Already scraped index 222000\n", + "Already scraped index 223000\n", + "Already scraped index 224000\n", + "Already scraped index 225000\n", + "Already scraped index 226000\n" + ] + } + ], + "source": [ + "# Download every page that is not already on disk, one JSON file per page.\n", + "# Seconds to wait on a single request; without a timeout a stalled connection\n", + "# blocks forever and the requests.Timeout handler below can never fire.\n", + "request_timeout = 60\n", + "\n", + "for start_i in start_is:\n", + "    fname = './CVE-NVD-API/JSON/' + str(start_i) + \"_\" + str(rpp) + '.json'\n", + "    if os.path.isfile(fname):\n", + "        # No request was sent, so no rate-limit pause is needed here.\n", + "        print(\"Already scraped index \" + str(start_i))\n", + "        continue\n", + "    params['startIndex'] = start_i\n", + "    try:\n", + "        resp = sess.get(base_url, params=params, timeout=request_timeout)\n", + "        if resp.status_code != 200:\n", + "            print(\"Failed to get \" + str(start_i) + \" with status code \" + str(resp.status_code))\n", + "            continue\n", + "        try:\n", + "            d = json.loads(resp.text)\n", + "        except ValueError:  # json.JSONDecodeError is a ValueError subclass\n", + "            print(\"failed to load json \" + str(start_i))\n", + "            continue\n", + "        # Write under a temporary name and rename afterwards, so an interrupted run\n", + "        # never leaves a truncated file that a later run reports as already scraped.\n", + "        tmp_fname = fname + '.tmp'\n", + "        with open(tmp_fname, 'w') as f:\n", + "            json.dump(d, f)\n", + "        os.replace(tmp_fname, fname)\n", + "        print(\"Successfully saved \" + str(start_i))\n", + "    except requests.Timeout:\n", + "        print(\"Request timed out for \" + str(start_i))\n", + "    except requests.RequestException as err:\n", + "        # Connection resets, DNS failures and similar: skip this page instead of\n", + "        # aborting the whole run; the missing file is retried on the next run.\n", + "        print(\"Request failed for \" + str(start_i) + \": \" + str(err))\n", + "    finally:\n", + "        # Pause after every request, including failed ones (finally also runs on\n", + "        # the continue statements above); skipping the pause on errors would send\n", + "        # the next request immediately and break the rate limit.\n", + "        loop_delay = random.uniform(time_delay, time_delay * 1.1)\n", + "        print(\"Waiting \" + str(loop_delay) + \" seconds\")\n", + "        time.sleep(loop_delay)" + ] + }, + { + "cell_type": "code", + "execution_count": null, +
"id": "726ce0f2-a026-4c32-8630-42eb88888f0f", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -418,7 +807,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.9.15" } }, "nbformat": 4,