# piermesh/src/Components/hopper.py

# NOTE: Used for requesting web pages
import requests

# NOTE: Used for parsing web pages
from bs4 import BeautifulSoup

# NOTE: Generic imports
import base64
import mimetypes
import logging

logger = logging.getLogger("__main__." + __name__)

def downloadFile(url, text=True, mimeType=None):
    """
    Download the resource at url and return it as text or a base64 data url

    Parameters
    ----------
    url: str
        Resource to fetch over http/s

    text: bool
        If True decode the body as UTF-8 text, otherwise return
        a data url encoding the raw bytes

    mimeType: str
        Mime type to use in the data url; when None it is guessed
        from the url's file extension

    Raises
    ------
    ValueError
        If text is False and no mime type was supplied or guessable
    """
    # Collect chunks in a list and join once: repeated bytes
    # concatenation (fbytes += chunk) is quadratic in total size
    chunks = []
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        for chunk in r.iter_content(chunk_size=8192):
            chunks.append(chunk)
    fbytes = b"".join(chunks)
    if text:
        return fbytes.decode("utf-8")
    else:
        if mimeType is None:
            mimeType, encoding = mimetypes.guess_type(url)
        if mimeType is None:
            raise ValueError(
                "Couldnt guess mime type and none was supplied, cant encode to data url"
            )
        b64str = base64.b64encode(fbytes).decode("utf-8")
        dataUrl = "data:{0};base64,{1}".format(mimeType, b64str)
        return dataUrl
def get(url: str, params=None, followTags=None):
    """
    http/s get request

    Parameters
    ----------
    url: str
    params
        Requests (library) parameters
    followTags
        None or list of tags to download the src/href from
    """
    logger.debug("Hopping to it")
    # TODO: Non blocking requests
    # WARN: Do not run self requests until this is fixed
    r = requests.get(url, params=params)
    logger.debug("Content retrieved, parsing")
    r = {
        "response": r.text,
        "code": r.status_code,
        "content-type": r.headers.get("content-type"),
    }
    logger.debug("Done parsing")
    # TODO: Reject followtags if content type is other then html
    if followTags is not None:
        soup = BeautifulSoup(r["response"], "html.parser")
        # TODO: Checking for relative links
        for tag in followTags:
            if tag in ["img", "video"]:
                for elem in soup.find_all(tag):
                    # Guard against elements with no src attribute
                    # (e.g. <video> using <source> children)
                    if elem.get("src"):
                        elem["src"] = downloadFile(elem["src"], text=False)
            elif tag in ["link"]:
                for elem in soup.find_all(tag):
                    # BeautifulSoup exposes rel as a multi-valued list,
                    # so test membership rather than equality
                    if "stylesheet" in elem.get("rel", []) and elem.get("href"):
                        style = downloadFile(elem["href"])
                        elem.decompose()
                        # Tag has no append_tag method, and new_tag's
                        # keyword args become attributes: build the tag,
                        # set its text via .string, then append to <head>
                        styleTag = soup.new_tag("style")
                        styleTag.string = style
                        soup.head.append(styleTag)
            elif tag == "script":
                for elem in soup.find_all(tag):
                    # Inline scripts have no src to follow
                    if elem.get("src"):
                        script = downloadFile(elem["src"])
                        elem["src"] = ""
                        elem.string = script
        # Serialize the modified tree back to html: soup.text would
        # strip all markup and discard the inlined resources
        r["response"] = str(soup)
    logger.debug("Done hopping")
    return r
def post(url: str, params=None):
    """
    http/s post request

    Parameters
    ----------
    url: str
    params
        Requests (library) parameters
    """
    resp = requests.post(url, data=params)
    return {"response": resp.text, "code": resp.status_code}