from functools import lru_cache from pyparsing import ( CharsNotIn, Group, OneOrMore, Optional, Word, alphanums, alphas, quotedString, removeQuotes, ) @lru_cache def _make_attribute_parser(): key_parser = Word(alphas, alphanums + "_") quoted_value_parser = quotedString.setParseAction(removeQuotes) unquoted_value_parser = CharsNotIn(" =\"'") key_value_pair_parser = ( key_parser + "=" + Optional(quoted_value_parser | unquoted_value_parser, default="") ) multiple_pairs_parser = OneOrMore(Group(key_value_pair_parser)) return multiple_pairs_parser def parse_spaced_key_value_pairs(text: str) -> dict[str, str]: """ Parses a string of key-value pairs separated by spaces. :param text: String of key-value pairs separated by spaces. :return: List of key-value pairs. """ if not text: return {} rows = _make_attribute_parser().parseString(text, parseAll=True) data = {r[0]: r[2] for r in rows} return data def parse_spaced_key_value_pairs_html(text: str): html_version = f"" parsed_html = parse_html_tag(html_version) return parsed_html["attributes"] def parse_html_tag(html_tag): """ Parses a single HTML tag and returns a dictionary with the tag name and its attributes. Args: html_tag (str): A string representing the HTML tag to be parsed. Returns: dict: A dictionary with 'tagname' and 'attributes'. 'tagname' is a string and 'attributes' is a dictionary. """ from html.parser import HTMLParser class MyHTMLParser(HTMLParser): def __init__(self): super().__init__() self.tagname = "" self.attributes = {} def handle_starttag(self, tag, attrs): self.tagname = tag self.attributes = dict(attrs) parser = MyHTMLParser() parser.feed(html_tag) return {"tagname": parser.tagname, "attributes": parser.attributes} def parse_spaced_key_value_pairs_re(text: str) -> dict[str, str]: """ Parses a string of key-value pairs separated by spaces. :param text: String of key-value pairs separated by spaces. :return: List of key-value pairs. """ if not text: return {} import re # Building regex parts for readability key_pattern = r"(?P\w+)" quoted_value_pattern = r'(?:"[^"\\]*(?:\\.[^"\\]*)*"|\'[^\'\\]*(?:\\.[^\'\\]*)*\')' unquoted_value_pattern = r'[^\'"\s]*' value_pattern = f"(?P{quoted_value_pattern}|{unquoted_value_pattern})" # Complete pattern with named groups pattern = rf"{key_pattern}={value_pattern}" # Find all matches matches = re.findall(pattern, text) # Validate the query string format if not matches and text: raise ValueError("Invalid format") parsed_query = {} for key, value in matches: if (value.startswith('"') and value.endswith('"')) or ( value.startswith("'") and value.endswith("'") ): # Remove quotes and handle escape sequences value = bytes(value[1:-1], "utf-8").decode("unicode_escape") parsed_query[key] = value return parsed_query