Bases: ConfigurableTool[ParseURLConfig]
Tool for parsing and extracting main content from URLs.
Fetches the content at a URL, removes non-content elements such as scripts, styles, and navigation,
and returns the clean text of the webpage's main body.
call
Fetch and parse content from the URL.
Returns:
| Name | Type | Description |
| --- | --- | --- |
| `str` | `str` | Cleaned text content from the URL if successful, error message if parsing fails |
Source code in mirascope/tools/web/_parse_url_content.py
def call(self) -> str:
    """Download the page at ``self.url`` and return its readable text.

    Script, style, nav, header, footer and aside elements are dropped. The
    text is then taken from the first ``<main>``, ``<article>`` or ``<div>``
    whose class matches ``content|main``; when none of those exist, the
    whole document is used.

    Returns:
        str: The page text with blank lines removed, a placeholder message
            when no text remains, or an error message when the request or
            the parsing fails.
    """
    try:
        # The request timeout is read from the tool configuration.
        target = self.url
        timeout = self._get_config().timeout
        response = requests.get(target, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, self._get_config().parser)

        # Strip page chrome and non-visible markup before extracting text.
        for tag_name in ("script", "style", "nav", "header", "footer", "aside"):
            for node in soup.find_all(tag_name):
                node.decompose()

        # Prefer a dedicated content container; otherwise use the full document.
        root = (
            soup.find("main")
            or soup.find("article")
            or soup.find("div", class_=re.compile("content|main"))
        )
        text = (root or soup).get_text(separator="\n", strip=True)

        # Trim every line and discard the ones left empty.
        stripped = (raw.strip() for raw in text.splitlines())
        content = "\n".join(filter(None, stripped))
        return content or "No content found on the page"
    except requests.RequestException as e:
        return f"Failed to fetch content from URL: {e!s}"
    except Exception as e:  # pragma: no cover
        return f"{type(e).__name__}: Failed to parse content from URL"
|