esandovalh180869 publicó en la clase Creación de DataFrames hace 11 horas
Buenas tardes. No pude realizar la importación de las tablas de Wikipedia con `pd.read_html`; la petición falla con el error **HTTP Error 403: Forbidden** (el rastreo completo está debajo). ¿Alguna sugerencia para arreglar el error, por favor?
---------------------------------------------------------------------------
HTTPError Traceback (most recent call last)
Cell In[25], line 1
----> 1 df_web = pd.read_html("https://es.wikipedia.org/wiki/Poblaci%C3%B3n_mundial")
2 df_web
File ~\anaconda3\Lib\site-packages\pandas\io\html.py:1240, in read_html(io, match, flavor, header, index_col, skiprows, attrs, parse_dates, thousands, encoding, decimal, converters, na_values, keep_default_na, displayed_only, extract_links, dtype_backend, storage_options)
1224 if isinstance(io, str) and not any(
1225 [
1226 is_file_like(io),
(...)
1230 ]
1231 ):
1232 warnings.warn(
1233 "Passing literal html to 'read_html' is deprecated and "
1234 "will be removed in a future version. To read from a "
(...)
1237 stacklevel=find_stack_level(),
1238 )
-> 1240 return _parse(
1241 flavor=flavor,
1242 io=io,
1243 match=match,
1244 header=header,
1245 index_col=index_col,
1246 skiprows=skiprows,
1247 parse_dates=parse_dates,
1248 thousands=thousands,
1249 attrs=attrs,
1250 encoding=encoding,
1251 decimal=decimal,
1252 converters=converters,
1253 na_values=na_values,
1254 keep_default_na=keep_default_na,
1255 displayed_only=displayed_only,
1256 extract_links=extract_links,
1257 dtype_backend=dtype_backend,
1258 storage_options=storage_options,
1259 )
File ~\anaconda3\Lib\site-packages\pandas\io\html.py:983, in _parse(flavor, io, match, attrs, encoding, displayed_only, extract_links, storage_options, **kwargs)
972 p = parser(
973 io,
974 compiled_match,
(...)
979 storage_options,
980 )
982 try:
--> 983 tables = p.parse_tables()
984 except ValueError as caught:
985 # if `io` is an io-like object, check if it's seekable
986 # and try to rewind it before trying the next parser
987 if hasattr(io, "seekable") and io.seekable():
File ~\anaconda3\Lib\site-packages\pandas\io\html.py:249, in _HtmlFrameParser.parse_tables(self)
241 def parse_tables(self):
242 """
243 Parse and return all tables from the DOM.
244
(...)
247 list of parsed (header, body, footer) tuples from tables.
248 """
--> 249 tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
250 return (self._parse_thead_tbody_tfoot(table) for table in tables)
File ~\anaconda3\Lib\site-packages\pandas\io\html.py:806, in _LxmlFrameParser._build_doc(self)
804 pass
805 else:
--> 806 raise e
807 else:
808 if not hasattr(r, "text_content"):
File ~\anaconda3\Lib\site-packages\pandas\io\html.py:785, in _LxmlFrameParser._build_doc(self)
783 try:
784 if is_url(self.io):
--> 785 with get_handle(
786 self.io, "r", storage_options=self.storage_options
787 ) as f:
788 r = parse(f.handle, parser=parser)
789 else:
790 # try to parse the input in the simplest way
File ~\anaconda3\Lib\site-packages\pandas\io\common.py:728, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
725 codecs.lookup_error(errors)
727 # open URLs
--> 728 ioargs = _get_filepath_or_buffer(
729 path_or_buf,
730 encoding=encoding,
731 compression=compression,
732 mode=mode,
733 storage_options=storage_options,
734 )
736 handle = ioargs.filepath_or_buffer
737 handles: list[BaseBuffer]
File ~\anaconda3\Lib\site-packages\pandas\io\common.py:384, in _get_filepath_or_buffer(filepath_or_buffer, encoding, compression, mode, storage_options)
382 # assuming storage_options is to be interpreted as headers
383 req_info = urllib.request.Request(filepath_or_buffer, headers=storage_options)
--> 384 with urlopen(req_info) as req:
385 content_encoding = req.headers.get("Content-Encoding", None)
386 if content_encoding == "gzip":
387 # Override compression based on Content-Encoding header
File ~\anaconda3\Lib\site-packages\pandas\io\common.py:289, in urlopen(*args, **kwargs)
283 """
284 Lazy-import wrapper for stdlib urlopen, as that imports a big chunk of
285 the stdlib.
286 """
287 import urllib.request
--> 289 return urllib.request.urlopen(*args, **kwargs)
File ~\anaconda3\Lib\urllib\request.py:189, in urlopen(url, data, timeout, context)
187 else:
188 opener = _opener
--> 189 return opener.open(url, data, timeout)
File ~\anaconda3\Lib\urllib\request.py:495, in OpenerDirector.open(self, fullurl, data, timeout)
493 for processor in self.process_response.get(protocol, []):
494 meth = getattr(processor, meth_name)
--> 495 response = meth(req, response)
497 return response
File ~\anaconda3\Lib\urllib\request.py:604, in HTTPErrorProcessor.http_response(self, request, response)
601 # According to RFC 2616, "2xx" code indicates that the client's
602 # request was successfully received, understood, and accepted.
603 if not (200 <= code < 300):
--> 604 response = self.parent.error(
605 'http', request, response, code, msg, hdrs)
607 return response
File ~\anaconda3\Lib\urllib\request.py:533, in OpenerDirector.error(self, proto, *args)
531 if http_err:
532 args = (dict, 'default', 'http_error_default') + orig_args
--> 533 return self._call_chain(*args)
File ~\anaconda3\Lib\urllib\request.py:466, in OpenerDirector._call_chain(self, chain, kind, meth_name, *args)
464 for handler in handlers:
465 func = getattr(handler, meth_name)
--> 466 result = func(*args)
467 if result is not None:
468 return result
File ~\anaconda3\Lib\urllib\request.py:613, in HTTPDefaultErrorHandler.http_error_default(self, req, fp, code, msg, hdrs)
612 def http_error_default(self, req, fp, code, msg, hdrs):
--> 613 raise HTTPError(req.full_url, code, msg, hdrs, fp)
HTTPError: HTTP Error 403: Forbidden
Respuestas