{
"cells": [
{
"cell_type": "markdown",
"id": "61c8a73e",
"metadata": {},
"source": [
"# CS 200: Requests\n",
"\n",
""
]
},
{
"cell_type": "markdown",
"id": "b306da8a",
"metadata": {},
"source": [
"The requests
module is an alternative to the urllib
module.\n",
"See Requests: HTTP for Humans for details.\n",
"\n",
"Here is a quick example."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "fc9c4602",
"metadata": {},
"outputs": [],
"source": [
"import requests"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "875b2bb1",
"metadata": {},
"outputs": [],
"source": [
"r = requests.get(\"https://zoo.cs.yale.edu/classes/cs200/index.html\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "936b30dc",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"200"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"r.status_code"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "4aea6a41",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['__attrs__',\n",
" '__bool__',\n",
" '__class__',\n",
" '__delattr__',\n",
" '__dict__',\n",
" '__dir__',\n",
" '__doc__',\n",
" '__enter__',\n",
" '__eq__',\n",
" '__exit__',\n",
" '__format__',\n",
" '__ge__',\n",
" '__getattribute__',\n",
" '__getstate__',\n",
" '__gt__',\n",
" '__hash__',\n",
" '__init__',\n",
" '__init_subclass__',\n",
" '__iter__',\n",
" '__le__',\n",
" '__lt__',\n",
" '__module__',\n",
" '__ne__',\n",
" '__new__',\n",
" '__nonzero__',\n",
" '__reduce__',\n",
" '__reduce_ex__',\n",
" '__repr__',\n",
" '__setattr__',\n",
" '__setstate__',\n",
" '__sizeof__',\n",
" '__str__',\n",
" '__subclasshook__',\n",
" '__weakref__',\n",
" '_content',\n",
" '_content_consumed',\n",
" '_next',\n",
" 'apparent_encoding',\n",
" 'close',\n",
" 'connection',\n",
" 'content',\n",
" 'cookies',\n",
" 'elapsed',\n",
" 'encoding',\n",
" 'headers',\n",
" 'history',\n",
" 'is_permanent_redirect',\n",
" 'is_redirect',\n",
" 'iter_content',\n",
" 'iter_lines',\n",
" 'json',\n",
" 'links',\n",
" 'next',\n",
" 'ok',\n",
" 'raise_for_status',\n",
" 'raw',\n",
" 'reason',\n",
" 'request',\n",
" 'status_code',\n",
" 'text',\n",
" 'url']"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dir(r)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "deec028c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"apparent_encoding \t\t ascii \n",
"\n",
"close \t\t > \n",
"\n",
"connection \t\t \n",
"\n",
"content \t\t b'\\n\\nCPSC 200 - Introduction to Information Systems\\n \\n\\n\\n\\n CPSC 200 - Introduction to Information Systems
\\n FALL 2022
\\n
\\n\\n\\n
\\n \\n[Home] \\n[Syllabus] \\n[Contact Info] \\n[Announcements]\\n[Lectures]\\n[Assignments]\\n[Computer Science Department]\\n
\\n\\n\\n\\n\\n\\nCourse Information
\\n\\n- \\n\\nGoogle\\'s Python Class,\\na gentle introduction to Python. \\nProgramming exercises available on the zoo at\\n/c/cs200/\\n\\n
- \\nPython.org. Everything you need online.\\n\\n\\n
- \\n Canvas course site, including Discussion.\\n\\n
- CS 200 Jupyter Notebooks\\n\\n
- \\nPython module of the week A guide to many, many\\nstandard, tested and available Python modules.\\n\\n\\n
- \\n\\ncs200help@cs.yale.edu.\\nEmail alias to reach all instructional staff members with a question.\\n\\n
- \\n\\nZoo tutorial, Spring 2014 edition.\\n\\n
- \\n\\nThe Zoo and the Zoo Annex, including instructions for\\nremote access to the Zoo, courtesy of Prof. Aspnes.\\n\\n
- Slade\\'s gentle introduction to UNIX.\\n
- \\n\\n Instructions for the submit script.\\n\\nBetter instructions for the submit script.\\n\\n
- \\n\\nSubmitting Homework Remotely.\\n\\n\\n
- \\n\\nPython style guide.\\n\\n\\n\\n\\n
\\n\\n - \\nMidterm 1: Thursday October 13, 7pm. WLH 207.\\n\\n
- \\nMidterm 2: Thursday November 10, 7pm. WLH 207.\\n\\n
- \\nFinal exam: Monday December 19, 9am. RTBA.\\n\\n
\\n\\n
\\n\\n\\n\\n\\n\\n\\n\\n\\n' \n",
"\n",
"cookies \t\t \n",
"\n",
"elapsed \t\t 0:00:00.089930 \n",
"\n",
"encoding \t\t UTF-8 \n",
"\n",
"headers \t\t {'Date': 'Mon, 19 Sep 2022 20:59:51 GMT', 'Server': 'Apache/2.4.6 (Red Hat Enterprise Linux)', 'Last-Modified': 'Mon, 19 Sep 2022 13:50:05 GMT', 'ETag': '\"a81-5e907fd36a9ec\"', 'Accept-Ranges': 'bytes', 'Content-Length': '2689', 'X-Cnection': 'close', 'Content-Type': 'text/html; charset=UTF-8'} \n",
"\n",
"history \t\t [] \n",
"\n",
"is_permanent_redirect \t\t False \n",
"\n",
"is_redirect \t\t False \n",
"\n",
"iter_content \t\t > \n",
"\n",
"iter_lines \t\t > \n",
"\n",
"json \t\t > \n",
"\n",
"links \t\t {} \n",
"\n",
"next \t\t None \n",
"\n",
"ok \t\t True \n",
"\n",
"raise_for_status \t\t > \n",
"\n",
"raw \t\t \n",
"\n",
"reason \t\t OK \n",
"\n",
"request \t\t \n",
"\n",
"status_code \t\t 200 \n",
"\n",
"text \t\t \n",
"\n",
"CPSC 200 - Introduction to Information Systems\n",
" \n",
"\n",
"\n",
"\n",
" CPSC 200 - Introduction to Information Systems
\n",
" FALL 2022
\n",
"
\n",
"\n",
"\n",
"
\n",
" \n",
"[Home] \n",
"[Syllabus] \n",
"[Contact Info] \n",
"[Announcements]\n",
"[Lectures]\n",
"[Assignments]\n",
"[Computer Science Department]\n",
"
\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"Course Information
\n",
"\n",
"- \n",
"\n",
"Google's Python Class,\n",
"a gentle introduction to Python. \n",
"Programming exercises available on the zoo at\n",
"/c/cs200/\n",
"\n",
"
- \n",
"Python.org. Everything you need online.\n",
"\n",
"\n",
"
- \n",
" Canvas course site, including Discussion.\n",
"\n",
"
- CS 200 Jupyter Notebooks\n",
"\n",
"
- \n",
"Python module of the week A guide to many, many\n",
"standard, tested and available Python modules.\n",
"\n",
"\n",
"
- \n",
"\n",
"cs200help@cs.yale.edu.\n",
"Email alias to reach all instructional staff members with a question.\n",
"\n",
"
- \n",
"\n",
"Zoo tutorial, Spring 2014 edition.\n",
"\n",
"
- \n",
"\n",
"The Zoo and the Zoo Annex, including instructions for\n",
"remote access to the Zoo, courtesy of Prof. Aspnes.\n",
"\n",
"
- Slade's gentle introduction to UNIX.\n",
"
- \n",
"\n",
" Instructions for the submit script.\n",
"\n",
"Better instructions for the submit script.\n",
"\n",
"
- \n",
"\n",
"Submitting Homework Remotely.\n",
"\n",
"\n",
"
- \n",
"\n",
"Python style guide.\n",
"\n",
"\n",
"\n",
"\n",
"
\n",
"\n",
" - \n",
"Midterm 1: Thursday October 13, 7pm. WLH 207.\n",
"\n",
"
- \n",
"Midterm 2: Thursday November 10, 7pm. WLH 207.\n",
"\n",
"
- \n",
"Final exam: Monday December 19, 9am. RTBA.\n",
"\n",
"
\n",
"\n",
"
\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
" \n",
"\n",
"url \t\t https://zoo.cs.yale.edu/classes/cs200/index.html \n",
"\n"
]
}
],
"source": [
"for prop in dir(r):\n",
" if prop.startswith(\"_\"):\n",
" pass\n",
" else:\n",
" print (prop, '\\t\\t', getattr(r,prop), '\\n')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "8e46ba0d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'UTF-8'"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"r.encoding"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "2b5bc24e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"6\n",
"6\n",
"61\n",
"69\n",
"7\n",
"6\n",
"4\n",
"61\n",
"24\n",
"5\n",
"0\n",
"17\n",
"4\n",
"26\n",
"32\n",
"39\n",
"42\n",
"48\n",
"38\n",
"44\n",
"64\n",
"4\n",
"21\n",
"0\n",
"68\n",
"0\n",
"0\n",
"27\n",
"4\n",
"4\n",
"52\n",
"26\n",
"33\n",
"45\n",
"9\n",
"0\n",
"48\n",
"43\n",
"0\n",
"0\n",
"57\n",
"11\n",
"49\n",
"0\n",
"73\n",
"0\n",
"48\n",
"51\n",
"46\n",
"0\n",
"0\n",
"5\n",
"39\n",
"26\n",
"69\n",
"0\n",
"5\n",
"40\n",
"38\n",
"0\n",
"4\n",
"73\n",
"57\n",
"51\n",
"0\n",
"103\n",
"5\n",
"44\n",
"41\n",
"37\n",
"46\n",
"0\n",
"4\n",
"37\n",
"33\n",
"0\n",
"4\n",
"4\n",
"34\n",
"27\n",
"3\n",
"4\n",
"62\n",
"23\n",
"0\n",
"0\n",
"0\n",
"0\n",
"4\n",
"0\n",
"4\n",
"46\n",
"0\n",
"4\n",
"46\n",
"0\n",
"4\n",
"42\n",
"0\n",
"5\n",
"0\n",
"4\n",
"68\n",
"0\n",
"30\n",
"57\n",
"9\n",
"0\n",
"0\n",
"7\n",
"7\n",
"0\n"
]
}
],
"source": [
"for line in r.iter_lines():\n",
" print (len(line))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "88519f86",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'https://zoo.cs.yale.edu/classes/cs200/index.html'"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"r.url"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "5a1235b9",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"str"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"type(r.text)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "80958135",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"bytes"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"type(r.content)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "5b00b1c2",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2689"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(r.text)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "e8556ee8",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'\\n\\nCPSC 200 - Introduc'"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"r.text[:40]"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "38b9ec85",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'OK'"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"r.reason"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "d1de646e",
"metadata": {},
"outputs": [],
"source": [
"import re"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "485a5e36",
"metadata": {},
"outputs": [],
"source": [
"matches = re.findall('href=',r.text)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "dd920cea",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"22"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(matches)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "4f8068c7",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"200"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"requests.codes.ok"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "6517e11f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'Date': 'Mon, 19 Sep 2022 20:59:51 GMT', 'Server': 'Apache/2.4.6 (Red Hat Enterprise Linux)', 'Last-Modified': 'Mon, 19 Sep 2022 13:50:05 GMT', 'ETag': '\"a81-5e907fd36a9ec\"', 'Accept-Ranges': 'bytes', 'Content-Length': '2689', 'X-Cnection': 'close', 'Content-Type': 'text/html; charset=UTF-8'}"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"r.headers"
]
},
{
"cell_type": "markdown",
"id": "c93286bd",
"metadata": {},
"source": [
"## Beautiful Soup\n",
"\n",
"See https://en.wikipedia.org/wiki/Beautiful_Soup_(HTML_parser)\n",
"\n",
"Also, see BeautifulSoup tutorial"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "5f1d8e8a",
"metadata": {},
"outputs": [],
"source": [
"from bs4 import BeautifulSoup\n",
"from urllib.request import urlopen"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "2bcfad21",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/\n",
"#mw-head\n",
"#searchInput\n",
"/wiki/Wikipedia\n",
"/wiki/Free_content\n",
"/wiki/Encyclopedia\n",
"/wiki/Help:Introduction_to_Wikipedia\n",
"/wiki/Special:Statistics\n",
"/wiki/English_language\n",
"/wiki/File:Queen_Elizabeth_II_in_March_2015.jpg\n",
"/wiki/Elizabeth_II\n",
"/wiki/Monarchy_of_the_United_Kingdom\n",
"/wiki/Commonwealth_realm\n",
"/wiki/Duke_of_York\n",
"/wiki/George_VI\n",
"/wiki/Queen_Elizabeth_The_Queen_Mother\n",
"/wiki/Heir_presumptive\n",
"/wiki/Abdication_of_Edward_VIII\n",
"/wiki/Edward_VIII\n",
"/wiki/Auxiliary_Territorial_Service\n",
"/wiki/Prince_Philip,_Duke_of_Edinburgh\n",
"/wiki/Death_and_funeral_of_Prince_Philip,_Duke_of_Edinburgh\n",
"/wiki/List_of_monarchs_in_Britain_by_length_of_reign\n",
"/wiki/Death_of_Diana,_Princess_of_Wales\n",
"/wiki/Diana,_Princess_of_Wales\n",
"/wiki/Death_and_state_funeral_of_Elizabeth_II\n",
"/wiki/Balmoral_Castle\n",
"/wiki/Charles_III\n",
"/wiki/Elizabeth_II\n",
"/wiki/Rotavirus\n",
"/wiki/NASA_Astronaut_Group_2\n",
"/wiki/September_2019_events_in_the_U.S._repo_market\n",
"/wiki/Wikipedia:Today%27s_featured_article/September_2022\n",
"https://lists.wikimedia.org/postorius/lists/daily-article-l.lists.wikimedia.org/\n",
"/wiki/Wikipedia:Featured_articles\n",
"/wiki/Wikipedia:About_Today%27s_featured_article\n",
"/wiki/File:Funeral_of_Edward_VII_-1910_-cropped.JPG\n",
"/wiki/Royal_Navy_State_Funeral_Gun_Carriage\n",
"/wiki/State_funeral_of_Queen_Victoria\n",
"/wiki/Westminster_Abbey\n",
"/wiki/James_O%27Donnell_(organist)\n",
"/wiki/Death_and_state_funeral_of_Elizabeth_II\n",
"/wiki/State_visit_by_Elizabeth_II_to_Spain\n",
"/wiki/Douglas_Chandor\n",
"/wiki/State_hearse\n",
"/wiki/Leverton_%26_Sons\n",
"/wiki/Funeral_directors_to_the_Royal_Household\n",
"/wiki/Windsor_Castle\n",
"/wiki/Shaw_Farm,_Windsor\n",
"/wiki/Queen%27s_Road_East_(song)\n",
"/wiki/Lo_Ta-yu\n",
"/wiki/Wikipedia:Recent_additions\n",
"/wiki/Help:Your_first_article\n",
"/wiki/Template_talk:Did_you_know\n",
"/wiki/File:Jason_Sudeikis_South_by_Southwest_2019_(cropped).jpg\n",
"/wiki/2022_Kyrgyzstan%E2%80%93Tajikistan_clashes\n",
"/wiki/2022_Swedish_general_election\n",
"/wiki/Sweden_Democrats\n",
"/wiki/Moderate_Party\n",
"/wiki/Christian_Democrats_(Sweden)\n",
"/wiki/Liberals_(Sweden)\n",
"/wiki/Riksdag\n",
"/wiki/Ted_Lasso\n",
"/wiki/Jason_Sudeikis\n",
"/wiki/Primetime_Emmy_Award_for_Outstanding_Comedy_Series\n",
"/wiki/Succession_(TV_series)\n",
"/wiki/Primetime_Emmy_Award_for_Outstanding_Drama_Series\n",
"/wiki/74th_Primetime_Emmy_Awards\n",
"/wiki/Jean-Luc_Godard\n",
"/wiki/Portal:Current_events\n",
"/wiki/Death_and_state_funeral_of_Elizabeth_II\n",
"/wiki/2022_Russian_invasion_of_Ukraine\n",
"/wiki/Deaths_in_2022\n",
"/wiki/John_Stearns\n",
"/wiki/Harry_Booth_(coach)\n",
"/wiki/Naresh_Kumar_(tennis)\n",
"/wiki/Shelby_Jordan\n",
"/wiki/Michael_DeGroote\n",
"/wiki/Eddie_Butler_(rugby_union)\n",
"/wiki/Wikipedia:In_the_news/Candidates\n",
"/wiki/September_19\n",
"/wiki/International_Talk_Like_a_Pirate_Day\n",
"/wiki/File:Plantagenet,_Edward,_The_Black_Prince,_Iconic_Image.JPG\n",
"/wiki/1356\n",
"/wiki/Hundred_Years%27_War\n",
"/wiki/Edward_the_Black_Prince\n",
"/wiki/Battle_of_Poitiers\n",
"/wiki/John_II_of_France\n",
"/wiki/1944\n",
"/wiki/World_War_II\n",
"/wiki/Moscow_Armistice\n",
"/wiki/Continuation_War\n",
"/wiki/1950\n",
"/wiki/Korean_War\n",
"/wiki/Battle_of_Nam_River\n",
"/wiki/1985\n",
"/wiki/1985_Mexico_City_earthquake\n",
"/wiki/Moment_magnitude_scale\n",
"/wiki/Mexico_City\n",
"/wiki/2011\n",
"/wiki/Mariano_Rivera\n",
"/wiki/Trevor_Hoffman\n",
"/wiki/Major_League_Baseball\n",
"/wiki/Save_(baseball)\n",
"/wiki/Leo_VI_the_Wise\n",
"/wiki/Mabel_Vernon\n",
"/wiki/Ashot_Nadanian\n",
"/wiki/September_18\n",
"/wiki/September_19\n",
"/wiki/September_20\n",
"/wiki/Wikipedia:Selected_anniversaries/September\n",
"https://lists.wikimedia.org/postorius/lists/daily-article-l.lists.wikimedia.org/\n",
"/wiki/List_of_days_of_the_year\n",
"/wiki/File:The_Good_Place_careta.png\n",
"/wiki/The_Good_Place\n",
"/wiki/Fantasy_television\n",
"/wiki/Television_comedy\n",
"/wiki/List_of_The_Good_Place_episodes\n",
"/wiki/Michael_Schur\n",
"/wiki/NBC\n",
"/wiki/Kristen_Bell\n",
"/wiki/Afterlife\n",
"/wiki/Ted_Danson\n",
"/wiki/Heaven\n",
"/wiki/Utopia\n",
"/wiki/Righteousness\n",
"/wiki/Pitch_(filmmaking)\n",
"/wiki/The_Good_Place_(season_1)\n",
"/wiki/The_Good_Place_(season_2)\n",
"/wiki/The_Good_Place_(season_3)\n",
"/wiki/The_Good_Place_(season_4)\n",
"/wiki/Primetime_Emmy_Awards\n",
"/wiki/Primetime_Emmy_Award_for_Outstanding_Comedy_Series\n",
"/wiki/Shout!_Factory\n",
"/wiki/List_of_The_Good_Place_episodes\n",
"/wiki/List_of_songs_recorded_by_Kyla\n",
"/wiki/List_of_birds_of_Ontario\n",
"/wiki/List_of_accolades_received_by_The_Mandalorian\n",
"/wiki/Wikipedia:Today%27s_featured_list/September_2022\n",
"/wiki/Wikipedia:Featured_lists\n",
"/wiki/File:Hrh_Princess_Elizabeth_in_the_Auxiliary_Territorial_Service,_April_1945_TR2832.jpg\n",
"/wiki/Women_in_World_War_II\n",
"/wiki/Home_front_during_World_War_II\n",
"/wiki/Elizabeth_II\n",
"/wiki/Subaltern_(military)\n",
"/wiki/Auxiliary_Territorial_Service\n",
"/wiki/British_Army\n",
"/wiki/Ministry_of_Information_(United_Kingdom)\n",
"/wiki/User:Angerey\n",
"/wiki/Template:POTD/2022-09-18\n",
"/wiki/Template:POTD/2022-09-17\n",
"/wiki/Template:POTD/2022-09-16\n",
"/wiki/Wikipedia:Picture_of_the_day/Archive\n",
"/wiki/Wikipedia:Featured_pictures\n",
"/wiki/Wikipedia:Community_portal\n",
"/wiki/Wikipedia:Village_pump\n",
"/wiki/Wikipedia:News\n",
"/wiki/Wikipedia:Teahouse\n",
"/wiki/Wikipedia:Help_desk\n",
"/wiki/Wikipedia:Reference_desk\n",
"/wiki/Wikipedia:Contents/Portals\n",
"/wiki/Wikimedia_Foundation\n",
"https://wikimediafoundation.org/our-work/wikimedia-projects/\n",
"https://commons.wikimedia.org/wiki/\n",
"https://commons.wikimedia.org/wiki/\n",
"https://www.mediawiki.org/wiki/\n",
"https://www.mediawiki.org/wiki/\n",
"https://meta.wikimedia.org/wiki/\n",
"https://meta.wikimedia.org/wiki/\n",
"https://en.wikibooks.org/wiki/\n",
"https://en.wikibooks.org/wiki/\n",
"https://www.wikidata.org/wiki/\n",
"https://www.wikidata.org/wiki/\n",
"https://en.wikinews.org/wiki/\n",
"https://en.wikinews.org/wiki/\n",
"https://en.wikiquote.org/wiki/\n",
"https://en.wikiquote.org/wiki/\n",
"https://en.wikisource.org/wiki/\n",
"https://en.wikisource.org/wiki/\n",
"https://species.wikimedia.org/wiki/\n",
"https://species.wikimedia.org/wiki/\n",
"https://en.wikiversity.org/wiki/\n",
"https://en.wikiversity.org/wiki/\n",
"https://en.wikivoyage.org/wiki/\n",
"https://en.wikivoyage.org/wiki/\n",
"https://en.wiktionary.org/wiki/\n",
"https://en.wiktionary.org/wiki/\n",
"/wiki/English_language\n",
"https://meta.wikimedia.org/wiki/List_of_Wikipedias\n",
"https://ar.wikipedia.org/wiki/\n",
"https://de.wikipedia.org/wiki/\n",
"https://es.wikipedia.org/wiki/\n",
"https://fr.wikipedia.org/wiki/\n",
"https://it.wikipedia.org/wiki/\n",
"https://nl.wikipedia.org/wiki/\n",
"https://ja.wikipedia.org/wiki/\n",
"https://pl.wikipedia.org/wiki/\n",
"https://pt.wikipedia.org/wiki/\n",
"https://ru.wikipedia.org/wiki/\n",
"https://sv.wikipedia.org/wiki/\n",
"https://uk.wikipedia.org/wiki/\n",
"https://vi.wikipedia.org/wiki/\n",
"https://zh.wikipedia.org/wiki/\n",
"https://id.wikipedia.org/wiki/\n",
"https://ms.wikipedia.org/wiki/\n",
"https://zh-min-nan.wikipedia.org/wiki/\n",
"https://bg.wikipedia.org/wiki/\n",
"https://ca.wikipedia.org/wiki/\n",
"https://cs.wikipedia.org/wiki/\n",
"https://da.wikipedia.org/wiki/\n",
"https://eo.wikipedia.org/wiki/\n",
"https://eu.wikipedia.org/wiki/\n",
"https://fa.wikipedia.org/wiki/\n",
"https://he.wikipedia.org/wiki/\n",
"https://ko.wikipedia.org/wiki/\n",
"https://hu.wikipedia.org/wiki/\n",
"https://no.wikipedia.org/wiki/\n",
"https://ro.wikipedia.org/wiki/\n",
"https://sr.wikipedia.org/wiki/\n",
"https://sh.wikipedia.org/wiki/\n",
"https://fi.wikipedia.org/wiki/\n",
"https://tr.wikipedia.org/wiki/\n",
"https://ast.wikipedia.org/wiki/\n",
"https://bn.wikipedia.org/wiki/\n",
"https://bs.wikipedia.org/wiki/\n",
"https://et.wikipedia.org/wiki/\n",
"https://el.wikipedia.org/wiki/\n",
"https://simple.wikipedia.org/wiki/\n",
"https://gl.wikipedia.org/wiki/\n",
"https://hr.wikipedia.org/wiki/\n",
"https://lv.wikipedia.org/wiki/\n",
"https://lt.wikipedia.org/wiki/\n",
"https://ml.wikipedia.org/wiki/\n",
"https://mk.wikipedia.org/wiki/\n",
"https://nn.wikipedia.org/wiki/\n",
"https://sq.wikipedia.org/wiki/\n",
"https://sk.wikipedia.org/wiki/\n",
"https://sl.wikipedia.org/wiki/\n",
"https://th.wikipedia.org/wiki/\n",
"https://en.wikipedia.org/w/index.php?title=Main_Page&oldid=1108085777\n",
"/wiki/Special:MyTalk\n",
"/wiki/Special:MyContributions\n",
"/w/index.php?title=Special:CreateAccount&returnto=Main+Page\n",
"/w/index.php?title=Special:UserLogin&returnto=Main+Page\n",
"/wiki/Main_Page\n",
"/wiki/Talk:Main_Page\n",
"/wiki/Main_Page\n",
"/w/index.php?title=Main_Page&action=edit\n",
"/w/index.php?title=Main_Page&action=history\n",
"/wiki/Main_Page\n",
"/wiki/Main_Page\n",
"/wiki/Wikipedia:Contents\n",
"/wiki/Portal:Current_events\n",
"/wiki/Special:Random\n",
"/wiki/Wikipedia:About\n",
"//en.wikipedia.org/wiki/Wikipedia:Contact_us\n",
"https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en\n",
"/wiki/Help:Contents\n",
"/wiki/Help:Introduction\n",
"/wiki/Wikipedia:Community_portal\n",
"/wiki/Special:RecentChanges\n",
"/wiki/Wikipedia:File_Upload_Wizard\n",
"/wiki/Special:WhatLinksHere/Main_Page\n",
"/wiki/Special:RecentChangesLinked/Main_Page\n",
"/wiki/Wikipedia:File_Upload_Wizard\n",
"/wiki/Special:SpecialPages\n",
"/w/index.php?title=Main_Page&oldid=1108085777\n",
"/w/index.php?title=Main_Page&action=info\n",
"/w/index.php?title=Special:CiteThisPage&page=Main_Page&id=1108085777&wpFormIdentifier=titleform\n",
"https://www.wikidata.org/wiki/Special:EntityPage/Q5296\n",
"/w/index.php?title=Special:DownloadAsPdf&page=Main_Page&action=show-download-screen\n",
"/w/index.php?title=Main_Page&printable=yes\n",
"https://commons.wikimedia.org/wiki/Main_Page\n",
"https://www.mediawiki.org/wiki/MediaWiki\n",
"https://meta.wikimedia.org/wiki/Main_Page\n",
"https://wikisource.org/wiki/Main_Page\n",
"https://species.wikimedia.org/wiki/Main_Page\n",
"https://en.wikibooks.org/wiki/Main_Page\n",
"https://www.wikidata.org/wiki/Wikidata:Main_Page\n",
"https://wikimania.wikimedia.org/wiki/2022:Wikimania\n",
"https://en.wikinews.org/wiki/Main_Page\n",
"https://en.wikiquote.org/wiki/Main_Page\n",
"https://en.wikisource.org/wiki/Main_Page\n",
"https://en.wikiversity.org/wiki/Wikiversity:Main_Page\n",
"https://en.wikivoyage.org/wiki/Main_Page\n",
"https://en.wiktionary.org/wiki/Wiktionary:Main_Page\n",
"https://ar.wikipedia.org/wiki/\n",
"https://bn.wikipedia.org/wiki/\n",
"https://bg.wikipedia.org/wiki/\n",
"https://bs.wikipedia.org/wiki/\n",
"https://ca.wikipedia.org/wiki/\n",
"https://cs.wikipedia.org/wiki/\n",
"https://da.wikipedia.org/wiki/\n",
"https://de.wikipedia.org/wiki/\n",
"https://et.wikipedia.org/wiki/\n",
"https://el.wikipedia.org/wiki/\n",
"https://es.wikipedia.org/wiki/\n",
"https://eo.wikipedia.org/wiki/\n",
"https://eu.wikipedia.org/wiki/\n",
"https://fa.wikipedia.org/wiki/\n",
"https://fr.wikipedia.org/wiki/\n",
"https://gl.wikipedia.org/wiki/\n",
"https://ko.wikipedia.org/wiki/\n",
"https://hr.wikipedia.org/wiki/\n",
"https://id.wikipedia.org/wiki/\n",
"https://it.wikipedia.org/wiki/\n",
"https://he.wikipedia.org/wiki/\n",
"https://ka.wikipedia.org/wiki/\n",
"https://lv.wikipedia.org/wiki/\n",
"https://lt.wikipedia.org/wiki/\n",
"https://hu.wikipedia.org/wiki/\n",
"https://mk.wikipedia.org/wiki/\n",
"https://ms.wikipedia.org/wiki/\n",
"https://nl.wikipedia.org/wiki/\n",
"https://ja.wikipedia.org/wiki/\n",
"https://no.wikipedia.org/wiki/\n",
"https://nn.wikipedia.org/wiki/\n",
"https://pl.wikipedia.org/wiki/\n",
"https://pt.wikipedia.org/wiki/\n",
"https://ro.wikipedia.org/wiki/\n",
"https://ru.wikipedia.org/wiki/\n",
"https://simple.wikipedia.org/wiki/\n",
"https://sk.wikipedia.org/wiki/\n",
"https://sl.wikipedia.org/wiki/\n",
"https://sr.wikipedia.org/wiki/\n",
"https://sh.wikipedia.org/wiki/\n",
"https://fi.wikipedia.org/wiki/\n",
"https://sv.wikipedia.org/wiki/\n",
"https://th.wikipedia.org/wiki/\n",
"https://tr.wikipedia.org/wiki/\n",
"https://uk.wikipedia.org/wiki/\n",
"https://vi.wikipedia.org/wiki/\n",
"https://zh.wikipedia.org/wiki/\n",
"//en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License\n",
"//creativecommons.org/licenses/by-sa/3.0/\n",
"//foundation.wikimedia.org/wiki/Terms_of_Use\n",
"//foundation.wikimedia.org/wiki/Privacy_policy\n",
"//www.wikimediafoundation.org/\n",
"https://foundation.wikimedia.org/wiki/Privacy_policy\n",
"/wiki/Wikipedia:About\n",
"/wiki/Wikipedia:General_disclaimer\n",
"//en.wikipedia.org/wiki/Wikipedia:Contact_us\n",
"//en.m.wikipedia.org/w/index.php?title=Main_Page&mobileaction=toggle_view_mobile\n",
"https://developer.wikimedia.org\n",
"https://stats.wikimedia.org/#/en.wikipedia.org\n",
"https://foundation.wikimedia.org/wiki/Cookie_statement\n",
"https://wikimediafoundation.org/\n",
"https://www.mediawiki.org/\n"
]
}
],
"source": [
"with urlopen('https://en.wikipedia.org/wiki/Main_Page') as response:\n",
" soup = BeautifulSoup(response, 'html.parser')\n",
" for anchor in soup.find_all('a'):\n",
" print(anchor.get('href', '/'))"
]
},
{
"cell_type": "markdown",
"id": "d99eb164",
"metadata": {},
"source": [
"Or using requests
instead of urlopen
."
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "9517d78f",
"metadata": {},
"outputs": [],
"source": [
"response = requests.get('https://en.wikipedia.org/wiki/Main_Page')"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "5e6a32e0",
"metadata": {},
"outputs": [],
"source": [
"soup = BeautifulSoup(response.content, 'html.parser')"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "e65aa013",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Wikipedia, the free encyclopedia"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"soup.title"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "ac48d1f3",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Wikipedia, the free encyclopedia'"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"soup.title.string"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "b01995cc",
"metadata": {},
"outputs": [],
"source": [
"nb_links = len(soup.find_all('a'))"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "7b954377",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"348"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nb_links"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "ec9bf8be",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"10052"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(soup.get_text())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1163159c",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}