CS 200: Requests

The requests module is an alternative to the urllib module. See the documentation, "Requests: HTTP for Humans", for details.

Here is a quick example.

In [13]:
import requests
In [14]:
# Fetch the course home page. The explicit timeout (in seconds) keeps this
# cell from blocking indefinitely if the server never answers.
r = requests.get("https://zoo.cs.yale.edu/classes/cs200/index.html", timeout=10)
In [15]:
# Numeric HTTP status code of the response (200 here).
r.status_code
Out[15]:
200
In [16]:
# List every attribute name on the response object, dunder and private names included.
dir(r)
Out[16]:
['__attrs__',
 '__bool__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__nonzero__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_content',
 '_content_consumed',
 '_next',
 'apparent_encoding',
 'close',
 'connection',
 'content',
 'cookies',
 'elapsed',
 'encoding',
 'headers',
 'history',
 'is_permanent_redirect',
 'is_redirect',
 'iter_content',
 'iter_lines',
 'json',
 'links',
 'next',
 'ok',
 'raise_for_status',
 'raw',
 'reason',
 'request',
 'status_code',
 'text',
 'url']
In [17]:
# Walk the response's attributes and print each public one next to its value.
for attr_name in dir(r):
    is_private = attr_name.startswith("_")
    if not is_private:
        attr_value = getattr(r, attr_name)
        print(attr_name, '\t\t', attr_value, '\n')
apparent_encoding 		 ascii 

close 		 <bound method Response.close of <Response [200]>> 

connection 		 <requests.adapters.HTTPAdapter object at 0x7f717f7c6fe0> 

content 		 b'<HTML>\n<HEAD>\n<TITLE>CPSC 200 - Introduction to Information Systems</TITLE>\n<BASE TARGET="_top">  <!-- Prevent Classes*V2 from branding links -->\n</HEAD>\n<BODY>\n<dl>\n  <dt><h1>CPSC 200 - Introduction to Information Systems</h1>\n  <dt><h2>FALL 2022</h2>\n</dl>\n\n<!-- Menu bar -->\n<hr>\n<a href="index.html"></a> \n<a href="index.html">[Home]</a> \n<a href="syllabus.html">[Syllabus]</a> \n<a href="contact.html">[Contact Info]</a> \n<a href="announcements.html">[Announcements]</a>\n<a href="lectures.html">[Lectures]</a>\n<a href="assignments.html">[Assignments]</a>\n<a href="http://cpsc.yale.edu">[Computer Science Department]</a>\n<hr>\n<!-- End menu bar -->\n\n<!----------------------------------------------------------------->\n\n\n<H3>Course Information</H3>\n<ul>\n<li>\n<a href="https://developers.google.com/edu/python/">\nGoogle\'s Python Class</a>,\na gentle introduction to Python. \nProgramming exercises available on the zoo at\n/c/cs200/\n\n<li><a target=we href="https://www.python.org/">\nPython.org</a>. Everything you need online.\n\n\n<li> <a href="https://yale.instructure.com/courses/78895"\ntarget=can>\n    Canvas</a> course site, including Discussion.\n\n<li> <a target=qw href="lectures/cs200.html">CS 200 Jupyter Notebooks</a>\n\n<li> <a target=rt href="https://pymotw.com/2/#">\nPython module of the week</a> A guide to many, many\nstandard, tested and available Python modules.\n\n\n<li> \n<a href="mailto:cs200help@cs.yale.edu">\ncs200help@cs.yale.edu</a>.\nEmail alias to reach all instructional staff members with a question.\n\n<li> \n<a href="materials/zootut/2014-zoo.pdf">\nZoo tutorial, Spring 2014 edition.</a>\n\n<li>\n<a href="http://www.cs.yale.edu/homes/aspnes/classes/223/notes.html#zoo">\nThe Zoo and the Zoo Annex</a>, including instructions for\nremote access to the Zoo, courtesy of Prof. 
Aspnes.\n\n<li> <a href="https://zoo.cs.yale.edu/classes/cs200/UNIX.html">Slade\'s gentle introduction to UNIX.</a>\n<li> \n<a href="materials/submit-instructions.txt">\n  Instructions for the submit script.</a>\n<a href="materials/CS200_Submit.pdf">\nBetter instructions for the submit script.</a>\n\n<li>\n<a href="materials/Submit-Guide.txt">\nSubmitting Homework Remotely</a>.\n\n<!--\n<li>\n<a href="materials/autograde.txt">\nautograde instructions</a>.\n-->\n<li>\n<a target=wq href="https://www.python.org/dev/peps/pep-0008/">\nPython style guide.</a>\n\n\n\n\n<hr>\n\n<li>\nMidterm 1: Thursday October 13, 7pm.  WLH 207.\n\n<li>\nMidterm 2: Thursday November 10, 7pm. WLH 207.\n\n<li>\nFinal exam: Monday December 19, 9am. RTBA.\n\n</ul>\n\n<hr>\n<!----------------------------------------------------------------->\n\n<script language="JavaScript">\ndocument.write("Last modified: " + document.lastModified)\n</script>\n\n\n</body>\n</html>\n\n' 

cookies 		 <RequestsCookieJar[]> 

elapsed 		 0:00:00.076179 

encoding 		 UTF-8 

headers 		 {'Date': 'Mon, 19 Sep 2022 18:10:10 GMT', 'Server': 'Apache/2.4.6 (Red Hat Enterprise Linux)', 'Last-Modified': 'Mon, 19 Sep 2022 13:50:05 GMT', 'ETag': '"a81-5e907fd36a9ec"', 'Accept-Ranges': 'bytes', 'Content-Length': '2689', 'X-Cnection': 'close', 'Content-Type': 'text/html; charset=UTF-8'} 

history 		 [] 

is_permanent_redirect 		 False 

is_redirect 		 False 

iter_content 		 <bound method Response.iter_content of <Response [200]>> 

iter_lines 		 <bound method Response.iter_lines of <Response [200]>> 

json 		 <bound method Response.json of <Response [200]>> 

links 		 {} 

next 		 None 

ok 		 True 

raise_for_status 		 <bound method Response.raise_for_status of <Response [200]>> 

raw 		 <urllib3.response.HTTPResponse object at 0x7f71afd923e0> 

reason 		 OK 

request 		 <PreparedRequest [GET]> 

status_code 		 200 

text 		 <HTML>
<HEAD>
<TITLE>CPSC 200 - Introduction to Information Systems</TITLE>
<BASE TARGET="_top">  <!-- Prevent Classes*V2 from branding links -->
</HEAD>
<BODY>
<dl>
  <dt><h1>CPSC 200 - Introduction to Information Systems</h1>
  <dt><h2>FALL 2022</h2>
</dl>

<!-- Menu bar -->
<hr>
<a href="index.html"></a> 
<a href="index.html">[Home]</a> 
<a href="syllabus.html">[Syllabus]</a> 
<a href="contact.html">[Contact Info]</a> 
<a href="announcements.html">[Announcements]</a>
<a href="lectures.html">[Lectures]</a>
<a href="assignments.html">[Assignments]</a>
<a href="http://cpsc.yale.edu">[Computer Science Department]</a>
<hr>
<!-- End menu bar -->

<!----------------------------------------------------------------->


<H3>Course Information</H3>
<ul>
<li>
<a href="https://developers.google.com/edu/python/">
Google's Python Class</a>,
a gentle introduction to Python. 
Programming exercises available on the zoo at
/c/cs200/

<li><a target=we href="https://www.python.org/">
Python.org</a>. Everything you need online.


<li> <a href="https://yale.instructure.com/courses/78895"
target=can>
    Canvas</a> course site, including Discussion.

<li> <a target=qw href="lectures/cs200.html">CS 200 Jupyter Notebooks</a>

<li> <a target=rt href="https://pymotw.com/2/#">
Python module of the week</a> A guide to many, many
standard, tested and available Python modules.


<li> 
<a href="mailto:cs200help@cs.yale.edu">
cs200help@cs.yale.edu</a>.
Email alias to reach all instructional staff members with a question.

<li> 
<a href="materials/zootut/2014-zoo.pdf">
Zoo tutorial, Spring 2014 edition.</a>

<li>
<a href="http://www.cs.yale.edu/homes/aspnes/classes/223/notes.html#zoo">
The Zoo and the Zoo Annex</a>, including instructions for
remote access to the Zoo, courtesy of Prof. Aspnes.

<li> <a href="https://zoo.cs.yale.edu/classes/cs200/UNIX.html">Slade's gentle introduction to UNIX.</a>
<li> 
<a href="materials/submit-instructions.txt">
  Instructions for the submit script.</a>
<a href="materials/CS200_Submit.pdf">
Better instructions for the submit script.</a>

<li>
<a href="materials/Submit-Guide.txt">
Submitting Homework Remotely</a>.

<!--
<li>
<a href="materials/autograde.txt">
autograde instructions</a>.
-->
<li>
<a target=wq href="https://www.python.org/dev/peps/pep-0008/">
Python style guide.</a>




<hr>

<li>
Midterm 1: Thursday October 13, 7pm.  WLH 207.

<li>
Midterm 2: Thursday November 10, 7pm. WLH 207.

<li>
Final exam: Monday December 19, 9am. RTBA.

</ul>

<hr>
<!----------------------------------------------------------------->

<script language="JavaScript">
document.write("Last modified: " + document.lastModified)
</script>


</body>
</html>

 

url 		 https://zoo.cs.yale.edu/classes/cs200/index.html 

In [18]:
# Encoding used to decode r.text; here it matches the charset declared in the
# Content-Type response header.
r.encoding
Out[18]:
'UTF-8'
In [19]:
# Print the length of every line that iter_lines() yields from the body.
for line_length in map(len, r.iter_lines()):
    print(line_length)
6
6
61
69
7
6
4
61
24
5
0
17
4
26
32
39
42
48
38
44
64
4
21
0
68
0
0
27
4
4
52
26
33
45
9
0
48
43
0
0
57
11
49
0
73
0
48
51
46
0
0
5
39
26
69
0
5
40
38
0
4
73
57
51
0
103
5
44
41
37
46
0
4
37
33
0
4
4
34
27
3
4
62
23
0
0
0
0
4
0
4
46
0
4
46
0
4
42
0
5
0
4
68
0
30
57
9
0
0
7
7
0
In [20]:
# URL of the response; identical to the requested URL here.
r.url
Out[20]:
'https://zoo.cs.yale.edu/classes/cs200/index.html'
In [21]:
# r.text is the body decoded to a str.
type(r.text)
Out[21]:
str
In [22]:
# r.content is the same body as raw, undecoded bytes.
type(r.content)
Out[22]:
bytes
In [23]:
# Character count of the decoded body. It equals the Content-Length header
# (2689) here, which is consistent with the page being plain ASCII.
len(r.text)
Out[23]:
2689
In [24]:
# Peek at the first 40 characters of the decoded body.
r.text[:40]
Out[24]:
'<HTML>\n<HEAD>\n<TITLE>CPSC 200 - Introduc'
In [25]:
# Textual reason phrase that accompanies the status code.
r.reason
Out[25]:
'OK'
In [26]:
import re
In [28]:
# Collect every literal occurrence of 'href=' in the page source. This is a raw
# substring search, so links inside HTML comments (the commented-out autograde
# link) are counted too.
matches = re.findall('href=',r.text)
In [31]:
# Number of 'href=' occurrences found in the page source.
len(matches)
Out[31]:
22
In [34]:
# Named constant for status 200; lets code compare against r.status_code
# without a magic number.
requests.codes.ok
Out[34]:
200
In [38]:
# Response headers sent by the server.
r.headers
Out[38]:
{'Date': 'Mon, 19 Sep 2022 18:10:10 GMT', 'Server': 'Apache/2.4.6 (Red Hat Enterprise Linux)', 'Last-Modified': 'Mon, 19 Sep 2022 13:50:05 GMT', 'ETag': '"a81-5e907fd36a9ec"', 'Accept-Ranges': 'bytes', 'Content-Length': '2689', 'X-Cnection': 'close', 'Content-Type': 'text/html; charset=UTF-8'}

Beautiful Soup

See https://en.wikipedia.org/wiki/Beautiful_Soup_(HTML_parser)

In [39]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
In [40]:
# Download the Wikipedia main page and parse it while the connection is open.
with urlopen('https://en.wikipedia.org/wiki/Main_Page') as response:
    soup = BeautifulSoup(response, 'html.parser')

# The document is fully parsed at this point, so the links can be listed
# after the connection has been closed. Anchors without an href print '/'.
for link in soup.find_all('a'):
    href = link.get('href', '/')
    print(href)
/
#mw-head
#searchInput
/wiki/Wikipedia
/wiki/Free_content
/wiki/Encyclopedia
/wiki/Help:Introduction_to_Wikipedia
/wiki/Special:Statistics
/wiki/English_language
/wiki/File:Queen_Elizabeth_II_in_March_2015.jpg
/wiki/Elizabeth_II
/wiki/Monarchy_of_the_United_Kingdom
/wiki/Commonwealth_realm
/wiki/Duke_of_York
/wiki/George_VI
/wiki/Queen_Elizabeth_The_Queen_Mother
/wiki/Heir_presumptive
/wiki/Abdication_of_Edward_VIII
/wiki/Edward_VIII
/wiki/Auxiliary_Territorial_Service
/wiki/Prince_Philip,_Duke_of_Edinburgh
/wiki/Death_and_funeral_of_Prince_Philip,_Duke_of_Edinburgh
/wiki/List_of_monarchs_in_Britain_by_length_of_reign
/wiki/Death_of_Diana,_Princess_of_Wales
/wiki/Diana,_Princess_of_Wales
/wiki/Death_and_state_funeral_of_Elizabeth_II
/wiki/Balmoral_Castle
/wiki/Charles_III
/wiki/Elizabeth_II
/wiki/Rotavirus
/wiki/NASA_Astronaut_Group_2
/wiki/September_2019_events_in_the_U.S._repo_market
/wiki/Wikipedia:Today%27s_featured_article/September_2022
https://lists.wikimedia.org/postorius/lists/daily-article-l.lists.wikimedia.org/
/wiki/Wikipedia:Featured_articles
/wiki/Wikipedia:About_Today%27s_featured_article
/wiki/File:Funeral_of_Edward_VII_-1910_-cropped.JPG
/wiki/Royal_Navy_State_Funeral_Gun_Carriage
/wiki/State_funeral_of_Queen_Victoria
/wiki/Westminster_Abbey
/wiki/James_O%27Donnell_(organist)
/wiki/Death_and_state_funeral_of_Elizabeth_II
/wiki/State_visit_by_Elizabeth_II_to_Spain
/wiki/Douglas_Chandor
/wiki/State_hearse
/wiki/Leverton_%26_Sons
/wiki/Funeral_directors_to_the_Royal_Household
/wiki/Windsor_Castle
/wiki/Shaw_Farm,_Windsor
/wiki/Queen%27s_Road_East_(song)
/wiki/Lo_Ta-yu
/wiki/Wikipedia:Recent_additions
/wiki/Help:Your_first_article
/wiki/Template_talk:Did_you_know
/wiki/File:Jason_Sudeikis_South_by_Southwest_2019_(cropped).jpg
/wiki/2022_Kyrgyzstan%E2%80%93Tajikistan_clashes
/wiki/2022_Swedish_general_election
/wiki/Sweden_Democrats
/wiki/Moderate_Party
/wiki/Christian_Democrats_(Sweden)
/wiki/Liberals_(Sweden)
/wiki/Riksdag
/wiki/Ted_Lasso
/wiki/Jason_Sudeikis
/wiki/Primetime_Emmy_Award_for_Outstanding_Comedy_Series
/wiki/Succession_(TV_series)
/wiki/Primetime_Emmy_Award_for_Outstanding_Drama_Series
/wiki/74th_Primetime_Emmy_Awards
/wiki/Jean-Luc_Godard
/wiki/Portal:Current_events
/wiki/Death_and_state_funeral_of_Elizabeth_II
/wiki/2022_Russian_invasion_of_Ukraine
/wiki/Deaths_in_2022
/wiki/John_Stearns
/wiki/Harry_Booth_(coach)
/wiki/Naresh_Kumar_(tennis)
/wiki/Shelby_Jordan
/wiki/Michael_DeGroote
/wiki/Eddie_Butler_(rugby_union)
/wiki/Wikipedia:In_the_news/Candidates
/wiki/September_19
/wiki/International_Talk_Like_a_Pirate_Day
/wiki/File:Plantagenet,_Edward,_The_Black_Prince,_Iconic_Image.JPG
/wiki/1356
/wiki/Hundred_Years%27_War
/wiki/Edward_the_Black_Prince
/wiki/Battle_of_Poitiers
/wiki/John_II_of_France
/wiki/1944
/wiki/World_War_II
/wiki/Moscow_Armistice
/wiki/Continuation_War
/wiki/1950
/wiki/Korean_War
/wiki/Battle_of_Nam_River
/wiki/1985
/wiki/1985_Mexico_City_earthquake
/wiki/Moment_magnitude_scale
/wiki/Mexico_City
/wiki/2011
/wiki/Mariano_Rivera
/wiki/Trevor_Hoffman
/wiki/Major_League_Baseball
/wiki/Save_(baseball)
/wiki/Leo_VI_the_Wise
/wiki/Mabel_Vernon
/wiki/Ashot_Nadanian
/wiki/September_18
/wiki/September_19
/wiki/September_20
/wiki/Wikipedia:Selected_anniversaries/September
https://lists.wikimedia.org/postorius/lists/daily-article-l.lists.wikimedia.org/
/wiki/List_of_days_of_the_year
/wiki/File:The_Good_Place_careta.png
/wiki/The_Good_Place
/wiki/Fantasy_television
/wiki/Television_comedy
/wiki/List_of_The_Good_Place_episodes
/wiki/Michael_Schur
/wiki/NBC
/wiki/Kristen_Bell
/wiki/Afterlife
/wiki/Ted_Danson
/wiki/Heaven
/wiki/Utopia
/wiki/Righteousness
/wiki/Pitch_(filmmaking)
/wiki/The_Good_Place_(season_1)
/wiki/The_Good_Place_(season_2)
/wiki/The_Good_Place_(season_3)
/wiki/The_Good_Place_(season_4)
/wiki/Primetime_Emmy_Awards
/wiki/Primetime_Emmy_Award_for_Outstanding_Comedy_Series
/wiki/Shout!_Factory
/wiki/List_of_The_Good_Place_episodes
/wiki/List_of_songs_recorded_by_Kyla
/wiki/List_of_birds_of_Ontario
/wiki/List_of_accolades_received_by_The_Mandalorian
/wiki/Wikipedia:Today%27s_featured_list/September_2022
/wiki/Wikipedia:Featured_lists
/wiki/File:Hrh_Princess_Elizabeth_in_the_Auxiliary_Territorial_Service,_April_1945_TR2832.jpg
/wiki/Women_in_World_War_II
/wiki/Home_front_during_World_War_II
/wiki/Elizabeth_II
/wiki/Subaltern_(military)
/wiki/Auxiliary_Territorial_Service
/wiki/British_Army
/wiki/Ministry_of_Information_(United_Kingdom)
/wiki/User:Angerey
/wiki/Template:POTD/2022-09-18
/wiki/Template:POTD/2022-09-17
/wiki/Template:POTD/2022-09-16
/wiki/Wikipedia:Picture_of_the_day/Archive
/wiki/Wikipedia:Featured_pictures
/wiki/Wikipedia:Community_portal
/wiki/Wikipedia:Village_pump
/wiki/Wikipedia:News
/wiki/Wikipedia:Teahouse
/wiki/Wikipedia:Help_desk
/wiki/Wikipedia:Reference_desk
/wiki/Wikipedia:Contents/Portals
/wiki/Wikimedia_Foundation
https://wikimediafoundation.org/our-work/wikimedia-projects/
https://commons.wikimedia.org/wiki/
https://commons.wikimedia.org/wiki/
https://www.mediawiki.org/wiki/
https://www.mediawiki.org/wiki/
https://meta.wikimedia.org/wiki/
https://meta.wikimedia.org/wiki/
https://en.wikibooks.org/wiki/
https://en.wikibooks.org/wiki/
https://www.wikidata.org/wiki/
https://www.wikidata.org/wiki/
https://en.wikinews.org/wiki/
https://en.wikinews.org/wiki/
https://en.wikiquote.org/wiki/
https://en.wikiquote.org/wiki/
https://en.wikisource.org/wiki/
https://en.wikisource.org/wiki/
https://species.wikimedia.org/wiki/
https://species.wikimedia.org/wiki/
https://en.wikiversity.org/wiki/
https://en.wikiversity.org/wiki/
https://en.wikivoyage.org/wiki/
https://en.wikivoyage.org/wiki/
https://en.wiktionary.org/wiki/
https://en.wiktionary.org/wiki/
/wiki/English_language
https://meta.wikimedia.org/wiki/List_of_Wikipedias
https://ar.wikipedia.org/wiki/
https://de.wikipedia.org/wiki/
https://es.wikipedia.org/wiki/
https://fr.wikipedia.org/wiki/
https://it.wikipedia.org/wiki/
https://nl.wikipedia.org/wiki/
https://ja.wikipedia.org/wiki/
https://pl.wikipedia.org/wiki/
https://pt.wikipedia.org/wiki/
https://ru.wikipedia.org/wiki/
https://sv.wikipedia.org/wiki/
https://uk.wikipedia.org/wiki/
https://vi.wikipedia.org/wiki/
https://zh.wikipedia.org/wiki/
https://id.wikipedia.org/wiki/
https://ms.wikipedia.org/wiki/
https://zh-min-nan.wikipedia.org/wiki/
https://bg.wikipedia.org/wiki/
https://ca.wikipedia.org/wiki/
https://cs.wikipedia.org/wiki/
https://da.wikipedia.org/wiki/
https://eo.wikipedia.org/wiki/
https://eu.wikipedia.org/wiki/
https://fa.wikipedia.org/wiki/
https://he.wikipedia.org/wiki/
https://ko.wikipedia.org/wiki/
https://hu.wikipedia.org/wiki/
https://no.wikipedia.org/wiki/
https://ro.wikipedia.org/wiki/
https://sr.wikipedia.org/wiki/
https://sh.wikipedia.org/wiki/
https://fi.wikipedia.org/wiki/
https://tr.wikipedia.org/wiki/
https://ast.wikipedia.org/wiki/
https://bn.wikipedia.org/wiki/
https://bs.wikipedia.org/wiki/
https://et.wikipedia.org/wiki/
https://el.wikipedia.org/wiki/
https://simple.wikipedia.org/wiki/
https://gl.wikipedia.org/wiki/
https://hr.wikipedia.org/wiki/
https://lv.wikipedia.org/wiki/
https://lt.wikipedia.org/wiki/
https://ml.wikipedia.org/wiki/
https://mk.wikipedia.org/wiki/
https://nn.wikipedia.org/wiki/
https://sq.wikipedia.org/wiki/
https://sk.wikipedia.org/wiki/
https://sl.wikipedia.org/wiki/
https://th.wikipedia.org/wiki/
https://en.wikipedia.org/w/index.php?title=Main_Page&oldid=1108085777
/wiki/Special:MyTalk
/wiki/Special:MyContributions
/w/index.php?title=Special:CreateAccount&returnto=Main+Page
/w/index.php?title=Special:UserLogin&returnto=Main+Page
/wiki/Main_Page
/wiki/Talk:Main_Page
/wiki/Main_Page
/w/index.php?title=Main_Page&action=edit
/w/index.php?title=Main_Page&action=history
/wiki/Main_Page
/wiki/Main_Page
/wiki/Wikipedia:Contents
/wiki/Portal:Current_events
/wiki/Special:Random
/wiki/Wikipedia:About
//en.wikipedia.org/wiki/Wikipedia:Contact_us
https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en
/wiki/Help:Contents
/wiki/Help:Introduction
/wiki/Wikipedia:Community_portal
/wiki/Special:RecentChanges
/wiki/Wikipedia:File_Upload_Wizard
/wiki/Special:WhatLinksHere/Main_Page
/wiki/Special:RecentChangesLinked/Main_Page
/wiki/Wikipedia:File_Upload_Wizard
/wiki/Special:SpecialPages
/w/index.php?title=Main_Page&oldid=1108085777
/w/index.php?title=Main_Page&action=info
/w/index.php?title=Special:CiteThisPage&page=Main_Page&id=1108085777&wpFormIdentifier=titleform
https://www.wikidata.org/wiki/Special:EntityPage/Q5296
/w/index.php?title=Special:DownloadAsPdf&page=Main_Page&action=show-download-screen
/w/index.php?title=Main_Page&printable=yes
https://commons.wikimedia.org/wiki/Main_Page
https://www.mediawiki.org/wiki/MediaWiki
https://meta.wikimedia.org/wiki/Main_Page
https://wikisource.org/wiki/Main_Page
https://species.wikimedia.org/wiki/Main_Page
https://en.wikibooks.org/wiki/Main_Page
https://www.wikidata.org/wiki/Wikidata:Main_Page
https://wikimania.wikimedia.org/wiki/2022:Wikimania
https://en.wikinews.org/wiki/Main_Page
https://en.wikiquote.org/wiki/Main_Page
https://en.wikisource.org/wiki/Main_Page
https://en.wikiversity.org/wiki/Wikiversity:Main_Page
https://en.wikivoyage.org/wiki/Main_Page
https://en.wiktionary.org/wiki/Wiktionary:Main_Page
https://ar.wikipedia.org/wiki/
https://bn.wikipedia.org/wiki/
https://bg.wikipedia.org/wiki/
https://bs.wikipedia.org/wiki/
https://ca.wikipedia.org/wiki/
https://cs.wikipedia.org/wiki/
https://da.wikipedia.org/wiki/
https://de.wikipedia.org/wiki/
https://et.wikipedia.org/wiki/
https://el.wikipedia.org/wiki/
https://es.wikipedia.org/wiki/
https://eo.wikipedia.org/wiki/
https://eu.wikipedia.org/wiki/
https://fa.wikipedia.org/wiki/
https://fr.wikipedia.org/wiki/
https://gl.wikipedia.org/wiki/
https://ko.wikipedia.org/wiki/
https://hr.wikipedia.org/wiki/
https://id.wikipedia.org/wiki/
https://it.wikipedia.org/wiki/
https://he.wikipedia.org/wiki/
https://ka.wikipedia.org/wiki/
https://lv.wikipedia.org/wiki/
https://lt.wikipedia.org/wiki/
https://hu.wikipedia.org/wiki/
https://mk.wikipedia.org/wiki/
https://ms.wikipedia.org/wiki/
https://nl.wikipedia.org/wiki/
https://ja.wikipedia.org/wiki/
https://no.wikipedia.org/wiki/
https://nn.wikipedia.org/wiki/
https://pl.wikipedia.org/wiki/
https://pt.wikipedia.org/wiki/
https://ro.wikipedia.org/wiki/
https://ru.wikipedia.org/wiki/
https://simple.wikipedia.org/wiki/
https://sk.wikipedia.org/wiki/
https://sl.wikipedia.org/wiki/
https://sr.wikipedia.org/wiki/
https://sh.wikipedia.org/wiki/
https://fi.wikipedia.org/wiki/
https://sv.wikipedia.org/wiki/
https://th.wikipedia.org/wiki/
https://tr.wikipedia.org/wiki/
https://uk.wikipedia.org/wiki/
https://vi.wikipedia.org/wiki/
https://zh.wikipedia.org/wiki/
//en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License
//creativecommons.org/licenses/by-sa/3.0/
//foundation.wikimedia.org/wiki/Terms_of_Use
//foundation.wikimedia.org/wiki/Privacy_policy
//www.wikimediafoundation.org/
https://foundation.wikimedia.org/wiki/Privacy_policy
/wiki/Wikipedia:About
/wiki/Wikipedia:General_disclaimer
//en.wikipedia.org/wiki/Wikipedia:Contact_us
//en.m.wikipedia.org/w/index.php?title=Main_Page&mobileaction=toggle_view_mobile
https://developer.wikimedia.org
https://stats.wikimedia.org/#/en.wikipedia.org
https://foundation.wikimedia.org/wiki/Cookie_statement
https://wikimediafoundation.org/
https://www.mediawiki.org/

Alternatively, fetch the page with requests instead of urlopen.

In [44]:
# Fetch the Wikipedia main page with requests. The explicit timeout (in
# seconds) keeps this cell from blocking indefinitely if the server never answers.
response = requests.get('https://en.wikipedia.org/wiki/Main_Page', timeout=10)
In [45]:
# Parse the raw response bytes with Python's built-in HTML parser.
soup = BeautifulSoup(response.content, 'html.parser')
In [46]:
# The page's <title> element, shown with its markup.
soup.title
Out[46]:
<title>Wikipedia, the free encyclopedia</title>
In [47]:
# Just the text inside the <title> element.
soup.title.string
Out[47]:
'Wikipedia, the free encyclopedia'
In [48]:
# Count the <a> elements in the parsed page.
nb_links = len(soup.find_all('a'))
In [49]:
# Display the anchor count computed in the previous cell.
nb_links
Out[49]:
348
In [50]:
# Number of characters in the page's text content once the tags are removed.
len(soup.get_text())
Out[50]:
10052
In [ ]: