{ "cells": [ { "cell_type": "markdown", "id": "61c8a73e", "metadata": {}, "source": [ "# CS 200: Requests\n", "\n", "" ] }, { "cell_type": "markdown", "id": "b306da8a", "metadata": {}, "source": [ "The requests module is an alternative to the urllib module.\n", "See [Requests: HTTP for Humans](https://requests.readthedocs.io/) for details.\n", "\n", "Here is a quick example." ] }, { "cell_type": "code", "execution_count": 1, "id": "fc9c4602", "metadata": {}, "outputs": [], "source": [ "import requests" ] }, { "cell_type": "code", "execution_count": 2, "id": "875b2bb1", "metadata": {}, "outputs": [], "source": [ "r = requests.get(\"https://zoo.cs.yale.edu/classes/cs200/index.html\")" ] }, { "cell_type": "code", "execution_count": 3, "id": "936b30dc", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "200" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "r.status_code" ] }, { "cell_type": "code", "execution_count": 4, "id": "4aea6a41", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['__attrs__',\n", " '__bool__',\n", " '__class__',\n", " '__delattr__',\n", " '__dict__',\n", " '__dir__',\n", " '__doc__',\n", " '__enter__',\n", " '__eq__',\n", " '__exit__',\n", " '__format__',\n", " '__ge__',\n", " '__getattribute__',\n", " '__getstate__',\n", " '__gt__',\n", " '__hash__',\n", " '__init__',\n", " '__init_subclass__',\n", " '__iter__',\n", " '__le__',\n", " '__lt__',\n", " '__module__',\n", " '__ne__',\n", " '__new__',\n", " '__nonzero__',\n", " '__reduce__',\n", " '__reduce_ex__',\n", " '__repr__',\n", " '__setattr__',\n", " '__setstate__',\n", " '__sizeof__',\n", " '__str__',\n", " '__subclasshook__',\n", " '__weakref__',\n", " '_content',\n", " '_content_consumed',\n", " '_next',\n", " 'apparent_encoding',\n", " 'close',\n", " 'connection',\n", " 'content',\n", " 'cookies',\n", " 'elapsed',\n", " 'encoding',\n", " 'headers',\n", " 'history',\n", " 'is_permanent_redirect',\n", " 'is_redirect',\n", " 'iter_content',\n", " 'iter_lines',\n", 
'json',\n", " 'links',\n", " 'next',\n", " 'ok',\n", " 'raise_for_status',\n", " 'raw',\n", " 'reason',\n", " 'request',\n", " 'status_code',\n", " 'text',\n", " 'url']" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dir(r)" ] }, { "cell_type": "code", "execution_count": 5, "id": "deec028c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "apparent_encoding \t\t ascii \n", "\n", "close \t\t > \n", "\n", "connection \t\t \n", "\n", "content \t\t b'\\n\\nCPSC 200 - Introduction to Information Systems\\n \\n\\n\\n
\\n

CPSC 200 - Introduction to Information Systems

\\n

FALL 2022

\\n
\\n\\n\\n
\\n \\n[Home] \\n[Syllabus] \\n[Contact Info] \\n[Announcements]\\n[Lectures]\\n[Assignments]\\n[Computer Science Department]\\n
\\n\\n\\n\\n\\n\\n

Course Information

\\n\\n\\n
\\n\\n\\n\\n\\n\\n\\n\\n\\n' \n", "\n", "cookies \t\t \n", "\n", "elapsed \t\t 0:00:00.089930 \n", "\n", "encoding \t\t UTF-8 \n", "\n", "headers \t\t {'Date': 'Mon, 19 Sep 2022 20:59:51 GMT', 'Server': 'Apache/2.4.6 (Red Hat Enterprise Linux)', 'Last-Modified': 'Mon, 19 Sep 2022 13:50:05 GMT', 'ETag': '\"a81-5e907fd36a9ec\"', 'Accept-Ranges': 'bytes', 'Content-Length': '2689', 'X-Cnection': 'close', 'Content-Type': 'text/html; charset=UTF-8'} \n", "\n", "history \t\t [] \n", "\n", "is_permanent_redirect \t\t False \n", "\n", "is_redirect \t\t False \n", "\n", "iter_content \t\t > \n", "\n", "iter_lines \t\t > \n", "\n", "json \t\t > \n", "\n", "links \t\t {} \n", "\n", "next \t\t None \n", "\n", "ok \t\t True \n", "\n", "raise_for_status \t\t > \n", "\n", "raw \t\t \n", "\n", "reason \t\t OK \n", "\n", "request \t\t \n", "\n", "status_code \t\t 200 \n", "\n", "text \t\t \n", "\n", "CPSC 200 - Introduction to Information Systems\n", " \n", "\n", "\n", "
\n", "

CPSC 200 - Introduction to Information Systems

\n", "

FALL 2022

\n", "
\n", "\n", "\n", "
\n", " \n", "[Home] \n", "[Syllabus] \n", "[Contact Info] \n", "[Announcements]\n", "[Lectures]\n", "[Assignments]\n", "[Computer Science Department]\n", "
\n", "\n", "\n", "\n", "\n", "\n", "

Course Information

\n", "\n", "\n", "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", " \n", "\n", "url \t\t https://zoo.cs.yale.edu/classes/cs200/index.html \n", "\n" ] } ], "source": [ "for prop in dir(r):\n", " if prop.startswith(\"_\"):\n", " pass\n", " else:\n", " print (prop, '\\t\\t', getattr(r,prop), '\\n')" ] }, { "cell_type": "code", "execution_count": 6, "id": "8e46ba0d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'UTF-8'" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "r.encoding" ] }, { "cell_type": "code", "execution_count": 7, "id": "2b5bc24e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "6\n", "6\n", "61\n", "69\n", "7\n", "6\n", "4\n", "61\n", "24\n", "5\n", "0\n", "17\n", "4\n", "26\n", "32\n", "39\n", "42\n", "48\n", "38\n", "44\n", "64\n", "4\n", "21\n", "0\n", "68\n", "0\n", "0\n", "27\n", "4\n", "4\n", "52\n", "26\n", "33\n", "45\n", "9\n", "0\n", "48\n", "43\n", "0\n", "0\n", "57\n", "11\n", "49\n", "0\n", "73\n", "0\n", "48\n", "51\n", "46\n", "0\n", "0\n", "5\n", "39\n", "26\n", "69\n", "0\n", "5\n", "40\n", "38\n", "0\n", "4\n", "73\n", "57\n", "51\n", "0\n", "103\n", "5\n", "44\n", "41\n", "37\n", "46\n", "0\n", "4\n", "37\n", "33\n", "0\n", "4\n", "4\n", "34\n", "27\n", "3\n", "4\n", "62\n", "23\n", "0\n", "0\n", "0\n", "0\n", "4\n", "0\n", "4\n", "46\n", "0\n", "4\n", "46\n", "0\n", "4\n", "42\n", "0\n", "5\n", "0\n", "4\n", "68\n", "0\n", "30\n", "57\n", "9\n", "0\n", "0\n", "7\n", "7\n", "0\n" ] } ], "source": [ "for line in r.iter_lines():\n", " print (len(line))" ] }, { "cell_type": "code", "execution_count": 8, "id": "88519f86", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'https://zoo.cs.yale.edu/classes/cs200/index.html'" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "r.url" ] }, { "cell_type": "code", "execution_count": 9, "id": "5a1235b9", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "str" ] 
}, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(r.text)" ] }, { "cell_type": "code", "execution_count": 10, "id": "80958135", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "bytes" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(r.content)" ] }, { "cell_type": "code", "execution_count": 11, "id": "5b00b1c2", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2689" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(r.text)" ] }, { "cell_type": "code", "execution_count": 12, "id": "e8556ee8", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'\\n\\nCPSC 200 - Introduc'" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "r.text[:40]" ] }, { "cell_type": "code", "execution_count": 13, "id": "38b9ec85", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'OK'" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "r.reason" ] }, { "cell_type": "code", "execution_count": 14, "id": "d1de646e", "metadata": {}, "outputs": [], "source": [ "import re" ] }, { "cell_type": "code", "execution_count": 15, "id": "485a5e36", "metadata": {}, "outputs": [], "source": [ "matches = re.findall('href=',r.text)" ] }, { "cell_type": "code", "execution_count": 16, "id": "dd920cea", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "22" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(matches)" ] }, { "cell_type": "code", "execution_count": 17, "id": "4f8068c7", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "200" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "requests.codes.ok" ] }, { "cell_type": "code", "execution_count": 18, "id": "6517e11f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'Date': 'Mon, 19 Sep 
2022 20:59:51 GMT', 'Server': 'Apache/2.4.6 (Red Hat Enterprise Linux)', 'Last-Modified': 'Mon, 19 Sep 2022 13:50:05 GMT', 'ETag': '\"a81-5e907fd36a9ec\"', 'Accept-Ranges': 'bytes', 'Content-Length': '2689', 'X-Cnection': 'close', 'Content-Type': 'text/html; charset=UTF-8'}" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "r.headers" ] }, { "cell_type": "markdown", "id": "c93286bd", "metadata": {}, "source": [ "## Beautiful Soup\n", "\n", "Beautiful Soup (the <code>bs4</code> package) parses HTML into a searchable tree, so links can be extracted by tag instead of by counting <code>href=</code> matches with a regular expression as above.\n", "\n", "See the Wikipedia article: <a target=\"_blank\" href=\"https://en.wikipedia.org/wiki/Beautiful_Soup_(HTML_parser)\">Beautiful Soup (HTML parser)</a>.\n", "\n", "Also, see this <a target=\"_blank\" href=\"https://www.scrapingbee.com/blog/python-web-scraping-beautiful-soup/\">Beautiful Soup tutorial</a>." ] }, { "cell_type": "code", "execution_count": 19, "id": "5f1d8e8a", "metadata": {}, "outputs": [], "source": [ "from bs4 import BeautifulSoup\n", "from urllib.request import urlopen" ] }, { "cell_type": "code", "execution_count": 20, "id": "2bcfad21", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/\n", "#mw-head\n", "#searchInput\n", "/wiki/Wikipedia\n", "/wiki/Free_content\n", "/wiki/Encyclopedia\n", "/wiki/Help:Introduction_to_Wikipedia\n", "/wiki/Special:Statistics\n", "/wiki/English_language\n", "/wiki/File:Queen_Elizabeth_II_in_March_2015.jpg\n", "/wiki/Elizabeth_II\n", "/wiki/Monarchy_of_the_United_Kingdom\n", "/wiki/Commonwealth_realm\n", "/wiki/Duke_of_York\n", "/wiki/George_VI\n", "/wiki/Queen_Elizabeth_The_Queen_Mother\n", "/wiki/Heir_presumptive\n", "/wiki/Abdication_of_Edward_VIII\n", "/wiki/Edward_VIII\n", "/wiki/Auxiliary_Territorial_Service\n", "/wiki/Prince_Philip,_Duke_of_Edinburgh\n", "/wiki/Death_and_funeral_of_Prince_Philip,_Duke_of_Edinburgh\n", "/wiki/List_of_monarchs_in_Britain_by_length_of_reign\n", "/wiki/Death_of_Diana,_Princess_of_Wales\n", "/wiki/Diana,_Princess_of_Wales\n", "/wiki/Death_and_state_funeral_of_Elizabeth_II\n", 
"/wiki/Balmoral_Castle\n", "/wiki/Charles_III\n", "/wiki/Elizabeth_II\n", "/wiki/Rotavirus\n", "/wiki/NASA_Astronaut_Group_2\n", "/wiki/September_2019_events_in_the_U.S._repo_market\n", "/wiki/Wikipedia:Today%27s_featured_article/September_2022\n", "https://lists.wikimedia.org/postorius/lists/daily-article-l.lists.wikimedia.org/\n", "/wiki/Wikipedia:Featured_articles\n", "/wiki/Wikipedia:About_Today%27s_featured_article\n", "/wiki/File:Funeral_of_Edward_VII_-1910_-cropped.JPG\n", "/wiki/Royal_Navy_State_Funeral_Gun_Carriage\n", "/wiki/State_funeral_of_Queen_Victoria\n", "/wiki/Westminster_Abbey\n", "/wiki/James_O%27Donnell_(organist)\n", "/wiki/Death_and_state_funeral_of_Elizabeth_II\n", "/wiki/State_visit_by_Elizabeth_II_to_Spain\n", "/wiki/Douglas_Chandor\n", "/wiki/State_hearse\n", "/wiki/Leverton_%26_Sons\n", "/wiki/Funeral_directors_to_the_Royal_Household\n", "/wiki/Windsor_Castle\n", "/wiki/Shaw_Farm,_Windsor\n", "/wiki/Queen%27s_Road_East_(song)\n", "/wiki/Lo_Ta-yu\n", "/wiki/Wikipedia:Recent_additions\n", "/wiki/Help:Your_first_article\n", "/wiki/Template_talk:Did_you_know\n", "/wiki/File:Jason_Sudeikis_South_by_Southwest_2019_(cropped).jpg\n", "/wiki/2022_Kyrgyzstan%E2%80%93Tajikistan_clashes\n", "/wiki/2022_Swedish_general_election\n", "/wiki/Sweden_Democrats\n", "/wiki/Moderate_Party\n", "/wiki/Christian_Democrats_(Sweden)\n", "/wiki/Liberals_(Sweden)\n", "/wiki/Riksdag\n", "/wiki/Ted_Lasso\n", "/wiki/Jason_Sudeikis\n", "/wiki/Primetime_Emmy_Award_for_Outstanding_Comedy_Series\n", "/wiki/Succession_(TV_series)\n", "/wiki/Primetime_Emmy_Award_for_Outstanding_Drama_Series\n", "/wiki/74th_Primetime_Emmy_Awards\n", "/wiki/Jean-Luc_Godard\n", "/wiki/Portal:Current_events\n", "/wiki/Death_and_state_funeral_of_Elizabeth_II\n", "/wiki/2022_Russian_invasion_of_Ukraine\n", "/wiki/Deaths_in_2022\n", "/wiki/John_Stearns\n", "/wiki/Harry_Booth_(coach)\n", "/wiki/Naresh_Kumar_(tennis)\n", "/wiki/Shelby_Jordan\n", "/wiki/Michael_DeGroote\n", 
"/wiki/Eddie_Butler_(rugby_union)\n", "/wiki/Wikipedia:In_the_news/Candidates\n", "/wiki/September_19\n", "/wiki/International_Talk_Like_a_Pirate_Day\n", "/wiki/File:Plantagenet,_Edward,_The_Black_Prince,_Iconic_Image.JPG\n", "/wiki/1356\n", "/wiki/Hundred_Years%27_War\n", "/wiki/Edward_the_Black_Prince\n", "/wiki/Battle_of_Poitiers\n", "/wiki/John_II_of_France\n", "/wiki/1944\n", "/wiki/World_War_II\n", "/wiki/Moscow_Armistice\n", "/wiki/Continuation_War\n", "/wiki/1950\n", "/wiki/Korean_War\n", "/wiki/Battle_of_Nam_River\n", "/wiki/1985\n", "/wiki/1985_Mexico_City_earthquake\n", "/wiki/Moment_magnitude_scale\n", "/wiki/Mexico_City\n", "/wiki/2011\n", "/wiki/Mariano_Rivera\n", "/wiki/Trevor_Hoffman\n", "/wiki/Major_League_Baseball\n", "/wiki/Save_(baseball)\n", "/wiki/Leo_VI_the_Wise\n", "/wiki/Mabel_Vernon\n", "/wiki/Ashot_Nadanian\n", "/wiki/September_18\n", "/wiki/September_19\n", "/wiki/September_20\n", "/wiki/Wikipedia:Selected_anniversaries/September\n", "https://lists.wikimedia.org/postorius/lists/daily-article-l.lists.wikimedia.org/\n", "/wiki/List_of_days_of_the_year\n", "/wiki/File:The_Good_Place_careta.png\n", "/wiki/The_Good_Place\n", "/wiki/Fantasy_television\n", "/wiki/Television_comedy\n", "/wiki/List_of_The_Good_Place_episodes\n", "/wiki/Michael_Schur\n", "/wiki/NBC\n", "/wiki/Kristen_Bell\n", "/wiki/Afterlife\n", "/wiki/Ted_Danson\n", "/wiki/Heaven\n", "/wiki/Utopia\n", "/wiki/Righteousness\n", "/wiki/Pitch_(filmmaking)\n", "/wiki/The_Good_Place_(season_1)\n", "/wiki/The_Good_Place_(season_2)\n", "/wiki/The_Good_Place_(season_3)\n", "/wiki/The_Good_Place_(season_4)\n", "/wiki/Primetime_Emmy_Awards\n", "/wiki/Primetime_Emmy_Award_for_Outstanding_Comedy_Series\n", "/wiki/Shout!_Factory\n", "/wiki/List_of_The_Good_Place_episodes\n", "/wiki/List_of_songs_recorded_by_Kyla\n", "/wiki/List_of_birds_of_Ontario\n", "/wiki/List_of_accolades_received_by_The_Mandalorian\n", "/wiki/Wikipedia:Today%27s_featured_list/September_2022\n", 
"/wiki/Wikipedia:Featured_lists\n", "/wiki/File:Hrh_Princess_Elizabeth_in_the_Auxiliary_Territorial_Service,_April_1945_TR2832.jpg\n", "/wiki/Women_in_World_War_II\n", "/wiki/Home_front_during_World_War_II\n", "/wiki/Elizabeth_II\n", "/wiki/Subaltern_(military)\n", "/wiki/Auxiliary_Territorial_Service\n", "/wiki/British_Army\n", "/wiki/Ministry_of_Information_(United_Kingdom)\n", "/wiki/User:Angerey\n", "/wiki/Template:POTD/2022-09-18\n", "/wiki/Template:POTD/2022-09-17\n", "/wiki/Template:POTD/2022-09-16\n", "/wiki/Wikipedia:Picture_of_the_day/Archive\n", "/wiki/Wikipedia:Featured_pictures\n", "/wiki/Wikipedia:Community_portal\n", "/wiki/Wikipedia:Village_pump\n", "/wiki/Wikipedia:News\n", "/wiki/Wikipedia:Teahouse\n", "/wiki/Wikipedia:Help_desk\n", "/wiki/Wikipedia:Reference_desk\n", "/wiki/Wikipedia:Contents/Portals\n", "/wiki/Wikimedia_Foundation\n", "https://wikimediafoundation.org/our-work/wikimedia-projects/\n", "https://commons.wikimedia.org/wiki/\n", "https://commons.wikimedia.org/wiki/\n", "https://www.mediawiki.org/wiki/\n", "https://www.mediawiki.org/wiki/\n", "https://meta.wikimedia.org/wiki/\n", "https://meta.wikimedia.org/wiki/\n", "https://en.wikibooks.org/wiki/\n", "https://en.wikibooks.org/wiki/\n", "https://www.wikidata.org/wiki/\n", "https://www.wikidata.org/wiki/\n", "https://en.wikinews.org/wiki/\n", "https://en.wikinews.org/wiki/\n", "https://en.wikiquote.org/wiki/\n", "https://en.wikiquote.org/wiki/\n", "https://en.wikisource.org/wiki/\n", "https://en.wikisource.org/wiki/\n", "https://species.wikimedia.org/wiki/\n", "https://species.wikimedia.org/wiki/\n", "https://en.wikiversity.org/wiki/\n", "https://en.wikiversity.org/wiki/\n", "https://en.wikivoyage.org/wiki/\n", "https://en.wikivoyage.org/wiki/\n", "https://en.wiktionary.org/wiki/\n", "https://en.wiktionary.org/wiki/\n", "/wiki/English_language\n", "https://meta.wikimedia.org/wiki/List_of_Wikipedias\n", "https://ar.wikipedia.org/wiki/\n", "https://de.wikipedia.org/wiki/\n", 
"https://es.wikipedia.org/wiki/\n", "https://fr.wikipedia.org/wiki/\n", "https://it.wikipedia.org/wiki/\n", "https://nl.wikipedia.org/wiki/\n", "https://ja.wikipedia.org/wiki/\n", "https://pl.wikipedia.org/wiki/\n", "https://pt.wikipedia.org/wiki/\n", "https://ru.wikipedia.org/wiki/\n", "https://sv.wikipedia.org/wiki/\n", "https://uk.wikipedia.org/wiki/\n", "https://vi.wikipedia.org/wiki/\n", "https://zh.wikipedia.org/wiki/\n", "https://id.wikipedia.org/wiki/\n", "https://ms.wikipedia.org/wiki/\n", "https://zh-min-nan.wikipedia.org/wiki/\n", "https://bg.wikipedia.org/wiki/\n", "https://ca.wikipedia.org/wiki/\n", "https://cs.wikipedia.org/wiki/\n", "https://da.wikipedia.org/wiki/\n", "https://eo.wikipedia.org/wiki/\n", "https://eu.wikipedia.org/wiki/\n", "https://fa.wikipedia.org/wiki/\n", "https://he.wikipedia.org/wiki/\n", "https://ko.wikipedia.org/wiki/\n", "https://hu.wikipedia.org/wiki/\n", "https://no.wikipedia.org/wiki/\n", "https://ro.wikipedia.org/wiki/\n", "https://sr.wikipedia.org/wiki/\n", "https://sh.wikipedia.org/wiki/\n", "https://fi.wikipedia.org/wiki/\n", "https://tr.wikipedia.org/wiki/\n", "https://ast.wikipedia.org/wiki/\n", "https://bn.wikipedia.org/wiki/\n", "https://bs.wikipedia.org/wiki/\n", "https://et.wikipedia.org/wiki/\n", "https://el.wikipedia.org/wiki/\n", "https://simple.wikipedia.org/wiki/\n", "https://gl.wikipedia.org/wiki/\n", "https://hr.wikipedia.org/wiki/\n", "https://lv.wikipedia.org/wiki/\n", "https://lt.wikipedia.org/wiki/\n", "https://ml.wikipedia.org/wiki/\n", "https://mk.wikipedia.org/wiki/\n", "https://nn.wikipedia.org/wiki/\n", "https://sq.wikipedia.org/wiki/\n", "https://sk.wikipedia.org/wiki/\n", "https://sl.wikipedia.org/wiki/\n", "https://th.wikipedia.org/wiki/\n", "https://en.wikipedia.org/w/index.php?title=Main_Page&oldid=1108085777\n", "/wiki/Special:MyTalk\n", "/wiki/Special:MyContributions\n", "/w/index.php?title=Special:CreateAccount&returnto=Main+Page\n", 
"/w/index.php?title=Special:UserLogin&returnto=Main+Page\n", "/wiki/Main_Page\n", "/wiki/Talk:Main_Page\n", "/wiki/Main_Page\n", "/w/index.php?title=Main_Page&action=edit\n", "/w/index.php?title=Main_Page&action=history\n", "/wiki/Main_Page\n", "/wiki/Main_Page\n", "/wiki/Wikipedia:Contents\n", "/wiki/Portal:Current_events\n", "/wiki/Special:Random\n", "/wiki/Wikipedia:About\n", "//en.wikipedia.org/wiki/Wikipedia:Contact_us\n", "https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en\n", "/wiki/Help:Contents\n", "/wiki/Help:Introduction\n", "/wiki/Wikipedia:Community_portal\n", "/wiki/Special:RecentChanges\n", "/wiki/Wikipedia:File_Upload_Wizard\n", "/wiki/Special:WhatLinksHere/Main_Page\n", "/wiki/Special:RecentChangesLinked/Main_Page\n", "/wiki/Wikipedia:File_Upload_Wizard\n", "/wiki/Special:SpecialPages\n", "/w/index.php?title=Main_Page&oldid=1108085777\n", "/w/index.php?title=Main_Page&action=info\n", "/w/index.php?title=Special:CiteThisPage&page=Main_Page&id=1108085777&wpFormIdentifier=titleform\n", "https://www.wikidata.org/wiki/Special:EntityPage/Q5296\n", "/w/index.php?title=Special:DownloadAsPdf&page=Main_Page&action=show-download-screen\n", "/w/index.php?title=Main_Page&printable=yes\n", "https://commons.wikimedia.org/wiki/Main_Page\n", "https://www.mediawiki.org/wiki/MediaWiki\n", "https://meta.wikimedia.org/wiki/Main_Page\n", "https://wikisource.org/wiki/Main_Page\n", "https://species.wikimedia.org/wiki/Main_Page\n", "https://en.wikibooks.org/wiki/Main_Page\n", "https://www.wikidata.org/wiki/Wikidata:Main_Page\n", "https://wikimania.wikimedia.org/wiki/2022:Wikimania\n", "https://en.wikinews.org/wiki/Main_Page\n", "https://en.wikiquote.org/wiki/Main_Page\n", "https://en.wikisource.org/wiki/Main_Page\n", "https://en.wikiversity.org/wiki/Wikiversity:Main_Page\n", "https://en.wikivoyage.org/wiki/Main_Page\n", "https://en.wiktionary.org/wiki/Wiktionary:Main_Page\n", 
"https://ar.wikipedia.org/wiki/\n", "https://bn.wikipedia.org/wiki/\n", "https://bg.wikipedia.org/wiki/\n", "https://bs.wikipedia.org/wiki/\n", "https://ca.wikipedia.org/wiki/\n", "https://cs.wikipedia.org/wiki/\n", "https://da.wikipedia.org/wiki/\n", "https://de.wikipedia.org/wiki/\n", "https://et.wikipedia.org/wiki/\n", "https://el.wikipedia.org/wiki/\n", "https://es.wikipedia.org/wiki/\n", "https://eo.wikipedia.org/wiki/\n", "https://eu.wikipedia.org/wiki/\n", "https://fa.wikipedia.org/wiki/\n", "https://fr.wikipedia.org/wiki/\n", "https://gl.wikipedia.org/wiki/\n", "https://ko.wikipedia.org/wiki/\n", "https://hr.wikipedia.org/wiki/\n", "https://id.wikipedia.org/wiki/\n", "https://it.wikipedia.org/wiki/\n", "https://he.wikipedia.org/wiki/\n", "https://ka.wikipedia.org/wiki/\n", "https://lv.wikipedia.org/wiki/\n", "https://lt.wikipedia.org/wiki/\n", "https://hu.wikipedia.org/wiki/\n", "https://mk.wikipedia.org/wiki/\n", "https://ms.wikipedia.org/wiki/\n", "https://nl.wikipedia.org/wiki/\n", "https://ja.wikipedia.org/wiki/\n", "https://no.wikipedia.org/wiki/\n", "https://nn.wikipedia.org/wiki/\n", "https://pl.wikipedia.org/wiki/\n", "https://pt.wikipedia.org/wiki/\n", "https://ro.wikipedia.org/wiki/\n", "https://ru.wikipedia.org/wiki/\n", "https://simple.wikipedia.org/wiki/\n", "https://sk.wikipedia.org/wiki/\n", "https://sl.wikipedia.org/wiki/\n", "https://sr.wikipedia.org/wiki/\n", "https://sh.wikipedia.org/wiki/\n", "https://fi.wikipedia.org/wiki/\n", "https://sv.wikipedia.org/wiki/\n", "https://th.wikipedia.org/wiki/\n", "https://tr.wikipedia.org/wiki/\n", "https://uk.wikipedia.org/wiki/\n", "https://vi.wikipedia.org/wiki/\n", "https://zh.wikipedia.org/wiki/\n", "//en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License\n", "//creativecommons.org/licenses/by-sa/3.0/\n", "//foundation.wikimedia.org/wiki/Terms_of_Use\n", "//foundation.wikimedia.org/wiki/Privacy_policy\n", "//www.wikimediafoundation.org/\n", 
"https://foundation.wikimedia.org/wiki/Privacy_policy\n", "/wiki/Wikipedia:About\n", "/wiki/Wikipedia:General_disclaimer\n", "//en.wikipedia.org/wiki/Wikipedia:Contact_us\n", "//en.m.wikipedia.org/w/index.php?title=Main_Page&mobileaction=toggle_view_mobile\n", "https://developer.wikimedia.org\n", "https://stats.wikimedia.org/#/en.wikipedia.org\n", "https://foundation.wikimedia.org/wiki/Cookie_statement\n", "https://wikimediafoundation.org/\n", "https://www.mediawiki.org/\n" ] } ], "source": [ "with urlopen('https://en.wikipedia.org/wiki/Main_Page') as response:\n", " soup = BeautifulSoup(response, 'html.parser')\n", " for anchor in soup.find_all('a'):\n", " print(anchor.get('href', '/'))" ] }, { "cell_type": "markdown", "id": "d99eb164", "metadata": {}, "source": [ "The same page can also be fetched with <code>requests</code> instead of <code>urlopen</code>; the response body (<code>response.content</code>) is then passed to <code>BeautifulSoup</code>." ] }, { "cell_type": "code", "execution_count": 21, "id": "9517d78f", "metadata": {}, "outputs": [], "source": [ "response = requests.get('https://en.wikipedia.org/wiki/Main_Page')" ] }, { "cell_type": "code", "execution_count": 22, "id": "5e6a32e0", "metadata": {}, "outputs": [], "source": [ "soup = BeautifulSoup(response.content, 'html.parser')" ] }, { "cell_type": "code", "execution_count": 23, "id": "e65aa013", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "<title>Wikipedia, the free encyclopedia" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "soup.title" ] }, { "cell_type": "code", "execution_count": 24, "id": "ac48d1f3", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Wikipedia, the free encyclopedia'" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "soup.title.string" ] }, { "cell_type": "code", "execution_count": 25, "id": "b01995cc", "metadata": {}, "outputs": [], "source": [ "nb_links = len(soup.find_all('a'))" ] }, { "cell_type": "code", "execution_count": 26, "id": "7b954377", "metadata": {}, "outputs": [ { "data": { 
"text/plain": [ "348" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nb_links" ] }, { "cell_type": "code", "execution_count": 27, "id": "ec9bf8be", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "10052" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(soup.get_text())" ] }, { "cell_type": "code", "execution_count": null, "id": "1163159c", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.5" } }, "nbformat": 4, "nbformat_minor": 5 }