# piermesh/src/Components/hopper.py

# NOTE: Used for requesting web pages
import requests

# NOTE: Used for parsing web pages
from bs4 import BeautifulSoup

# NOTE: Generic imports
import base64
import mimetypes
import logging

logger = logging.getLogger("__main__." + __name__)

def downloadFile(url, text=True, mimeType=None):
    """
    Download the resource at url and return it as text or a base64 data url

    Parameters
    ----------
    url: str
        Resource to fetch over http/s

    text: bool
        If True decode the body as UTF-8 text, otherwise return
        a data url encoding the raw bytes

    mimeType: str
        Mime type to use in the data url; when None it is guessed
        from the url's file extension

    Raises
    ------
    ValueError
        If text is False and no mime type was supplied or guessable
    """
    # Collect chunks in a list and join once: repeated bytes
    # concatenation (fbytes += chunk) is quadratic in total size
    chunks = []
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        for chunk in r.iter_content(chunk_size=8192):
            chunks.append(chunk)
    fbytes = b"".join(chunks)
    if text:
        return fbytes.decode("utf-8")
    else:
        if mimeType is None:
            mimeType, encoding = mimetypes.guess_type(url)
        if mimeType is None:
            raise ValueError(
                "Couldnt guess mime type and none was supplied, cant encode to data url"
            )
        b64str = base64.b64encode(fbytes).decode("utf-8")
        dataUrl = "data:{0};base64,{1}".format(mimeType, b64str)
        return dataUrl
def get(url: str, params=None, followTags=None):
    """
    http/s get request

    Parameters
    ----------
    url: str
    params
        Requests (library) parameters
    followTags
        None or list of tags to download the src/href from
    """
    logger.debug("Hopping to it")
    # TODO: Non blocking requests
    # WARN: Do not run self requests until this is fixed
    r = requests.get(url, params=params)
    logger.debug("Content retrieved, parsing")
    r = {
        "response": r.text,
        "code": r.status_code,
        "content-type": r.headers.get("content-type"),
    }
    logger.debug("Done parsing")
    # TODO: Reject followtags if content type is other then html
    if followTags is not None:
        soup = BeautifulSoup(r["response"], "html.parser")
        # TODO: Checking for relative links
        for tag in followTags:
            if tag in ["img", "video"]:
                for elem in soup.find_all(tag):
                    # Guard against elements with no src attribute
                    # (e.g. <video> using <source> children)
                    if elem.get("src"):
                        elem["src"] = downloadFile(elem["src"], text=False)
            elif tag in ["link"]:
                for elem in soup.find_all(tag):
                    # BeautifulSoup exposes rel as a multi-valued list,
                    # so test membership rather than equality
                    if "stylesheet" in elem.get("rel", []) and elem.get("href"):
                        style = downloadFile(elem["href"])
                        elem.decompose()
                        # Tag has no append_tag method, and new_tag's
                        # keyword args become attributes: build the tag,
                        # set its text via .string, then append to <head>
                        styleTag = soup.new_tag("style")
                        styleTag.string = style
                        soup.head.append(styleTag)
            elif tag == "script":
                for elem in soup.find_all(tag):
                    # Inline scripts have no src to follow
                    if elem.get("src"):
                        script = downloadFile(elem["src"])
                        elem["src"] = ""
                        elem.string = script
        # Serialize the modified tree back to html: soup.text would
        # strip all markup and discard the inlined resources
        r["response"] = str(soup)
    logger.debug("Done hopping")
    return r
def post(url: str, params=None):
    """
    http/s post request

    Parameters
    ----------
    url: str
    params
        Requests (library) parameters
    """
    resp = requests.post(url, data=params)
    return {"response": resp.text, "code": resp.status_code}