diff --git a/.gitignore b/.gitignore index 2a0786d..5b2d750 100644 --- a/.gitignore +++ b/.gitignore @@ -21,6 +21,9 @@ __pycache__/ .python-version .venv +# Local virtualenv freeze for this workspace +venv-requirements.txt + # MyPy .mypy_cache @@ -84,3 +87,4 @@ docs/_build/ # PyBuilder target/ assets/ +venv-requirements.txt diff --git a/README.md b/README.md index 21154d7..cb78ad8 100644 --- a/README.md +++ b/README.md @@ -1,133 +1,3 @@ -"### Install https://hyperglass.dev/installation/docker" - -mkdir -p /etc/hyperglass/svg - -cd /opt - -git clone https://github.com/CarlosSuporteISP/hyperglass_structured.git --depth=1 - -mv hyperglass_structured hyperglass - -cd /opt/hyperglass - -"### https://hyperglass.dev/configuration/overview" - -"### https://hyperglass.dev/configuration/config Change the files in the /etc/hyperglass folder after copying with your information or add something following the official doc" - -cp /opt/hyperglass/.samples/sample_config /etc/hyperglass/config.yaml - -cp /opt/hyperglass/.samples/sample_terms-and-conditions /etc/hyperglass/terms-and-conditions.md - -"### https://hyperglass.dev/configuration/devices Change the files in the /etc/hyperglass folder after copying with your information or add something following the official doc" - -cp /opt/hyperglass/.samples/sample_devices2 /etc/hyperglass/devices.yaml - -"### https://hyperglass.dev/configuration/directives Change the files in the /etc/hyperglass folder after copying with your information or add something following the official doc" - -cp /opt/hyperglass/.samples/sample_directives_huawei /etc/hyperglass/directives.yaml - -cp /opt/hyperglass/.samples/sample_directives_juniper /etc/hyperglass/directives.yaml - -cp /opt/hyperglass/.samples/sample_directives_mikrotik /etc/hyperglass/directives.yaml - -"### Environment Variables https://hyperglass.dev/installation/environment-variables" - -cp /opt/hyperglass/.samples/sample_hyperglass /etc/hyperglass/hyperglass.env - - -"###" - -You also need 
to add your AS prefixes to deny queries if you don't want others to look up your own prefixes from your hyperglass instance. - -In the directives file, there is a field that is usually commented out. This configuration is meant for devices like Huawei or MikroTik, but it is currently still using the default option from the directives. From what I've tested, putting the rules in the configuration folder (/etc/hyperglass/...) didn't work. If it works later, we can do everything within the directives file in /etc/hyperglass, but for now, it's okay to use the default. - -It's possible to create or use the ENTRYPOINT in the Dockerfile to change this at build time when starting the service, but I don't have time right now to stop and implement this. - -/opt/hyperglass/hyperglass/defaults/directives/huawei.py | /opt/hyperglass_structured/hyperglass/defaults/directives/mikrotik.py - -The code snippet, originally commented, should be modified to something like this: - - # DENY RULE FOR AS PREFIX - IPv4 - RuleWithIPv4( - condition="172.16.0.0/22", - ge="22", - le="32", - action="deny", - command="", - ), - - # DENY RULE FOR AS PREFIX - IPv6 - RuleWithIPv6( - condition="fd00:2::/32", - ge="32", - le="128", - action="deny", - command="", - ), - -mikrotik v6 - -command="ip route print detail without-paging where {target} in dst-address bgp and dst-address !=0.0.0.0/0", - -command="ipv6 route print detail without-paging where {target} in dst-address bgp and dst-address !=::/0", - -mikrotik v7 - -command="routing route print detail without-paging where {target} in dst-address bgp and dst-address !=0.0.0.0/0", - -command="routing route print detail without-paging where {target} in dst-address bgp and dst-address !=::/0", - - - -"###" - -"### Optional: Quickstart" - -cd /opt/hyperglass - -docker compose up - -"### Create a systemd service" - -cp /opt/hyperglass/.samples/hyperglass-docker.service /etc/hyperglass/hyperglass.service - -ln -s /etc/hyperglass/hyperglass.service 
/etc/systemd/system/hyperglass.service - -systemctl daemon-reload - -systemctl enable hyperglass - -systemctl start hyperglass - - - -"###" - - Acknowledgments: - - To thatmatt for this incredible project that I really like. Nothing against other Looking Glass (LG) projects. https://github.com/thatmattlove/hyperglass - - To remotti for the tips on Telegram, his attention, and for his fork https://github.com/remontti/hyperglass/tree/main, https://blog.remontti.com.br/7201, which is already quite deprecated due to its age (Node 14, etc.) and not being in Docker. This is why I decided to move to the official version. - - To the user \邪萬教教我/ @Yukaphoenix572 好呆. Thanks to a message from him in the Telegram group, my mind was opened to the solution after I searched through the conversations. - - To issue https://github.com/thatmattlove/hyperglass/issues/318 for the solution to queries that also weren't working on Tik-Tik (for those who use Claro). - - And of course, last but not least: to AIs. My apologies to those who don't like the "code vibe," but they help a lot. I used many of the six main AIs on the market, but only Manus truly managed to help me, contributing about 45% of the development, testing, adjustments, and descriptions. - -The total development time took over three weeks to get everything adjusted. Yes, I know I'm not that great at development, but I'm studying and improving. As I always say, in life and professionally, we always have something to learn; we never know everything. - -I also adjusted the official plugin (which wasn't working) for Huawei. - -The issue was the format in which the prefix was being passed to the device. Huawei expects the format 192.0.2.0 24 (with a space), but the official plugin was sending it in the 192.0.2.0/24 format (with a slash). - -The fix was made to adapt to the format that Huawei accepts for queries. - - - -"###" - -

@@ -157,6 +27,8 @@ hyperglass is intended to make implementing a looking glass too easy not to do, ## Features - BGP Route, BGP Community, BGP AS Path, Ping, & Traceroute, or [add your own commands](https://hyperglass.dev/configuration/directives). +- **Structured data output** with rich metadata for supported platforms +- **Enhanced traceroute** with ASN information, organization names, country codes, and IXP detection - Full IPv6 support - Customizable everything: features, theme, UI/API text, error messages, commands - Built-in support for: @@ -173,12 +45,17 @@ hyperglass is intended to make implementing a looking glass too easy not to do, - OpenBGPD - TNSR - VyOS +- **Structured BGP Route support** for: Arista EOS, FRRouting, Huawei VRP, Juniper Junos, MikroTik RouterOS +- **Structured Traceroute support** for: Arista EOS, FRRouting, Huawei VRP, Juniper Junos, MikroTik RouterOS/SwitchOS - Configurable support for any other [supported platform](https://hyperglass.dev/platforms) - Optionally access devices via an SSH proxy/jump server - Access-list/prefix-list style query control to whitelist or blacklist query targets - REST API with automatic, configurable OpenAPI documentation - Modern, responsive UI built on [ReactJS](https://reactjs.org/), with [NextJS](https://nextjs.org/) & [Chakra UI](https://chakra-ui.com/), written in [TypeScript](https://www.typescriptlang.org/) +- **AS path visualization** with interactive flow charts showing organization names +- **Offline IP enrichment** using BGP.tools bulk data and PeeringDB for maximum performance - Query multiple devices simultaneously +- **Concurrent processing** with non-blocking operations for improved performance - Browser-based DNS-over-HTTPS resolution of FQDN queries *To request support for a specific platform, please [submit a Github Issue](https://github.com/thatmattlove/hyperglass/issues/new) with the **feature** label.* @@ -206,5 +83,8 @@ hyperglass is built entirely on open-source software. 
Here are some of the aweso - [Litestar](https://litestar.dev) - [Pydantic](https://docs.pydantic.dev/latest/) - [Chakra UI](https://chakra-ui.com/) +- [React Flow](https://reactflow.dev/) - AS path visualization +- [BGP.tools](https://bgp.tools/) - IP enrichment data +- [PeeringDB](https://peeringdb.com/) - Network organization and IXP data [![GitHub](https://img.shields.io/github/license/thatmattlove/hyperglass?color=330036&style=for-the-badge)](https://github.com/thatmattlove/hyperglass/blob/main/LICENSE) diff --git a/docs/pages/configuration/config/structured-output.mdx b/docs/pages/configuration/config/structured-output.mdx index 1053023..ae53d9c 100644 --- a/docs/pages/configuration/config/structured-output.mdx +++ b/docs/pages/configuration/config/structured-output.mdx @@ -21,14 +21,20 @@ For external validation, hyperglass supports two backends: Additionally, hyperglass provides the ability to control which BGP communities are shown to the end user. -| Parameter | Type | Default Value | Description | -| :----------------------------- | :-------------- | :------------ | :------------------------------------------------------------------------------------------------------------------------------------- | -| `structured.rpki.mode` | String | router | Use `router` to use the router's view of the RPKI state, or `external` to use an external validation service. | -| `structured.rpki.backend` | String | cloudflare | When using `external` mode, choose `cloudflare` or `routinator` as the validation backend. | -| `structured.rpki.rpki_server_url` | String | | When using `routinator` backend, specify the base URL of your Routinator server (e.g., `http://rpki.example.com:3323`). | -| `structured.communities.mode` | String | deny | Use `deny` to deny any communities listed, `permit` to _only_ permit communities listed, or `name` to append friendly names. 
| -| `structured.communities.items` | List of Strings | | List of communities to match (used by `deny` and `permit` modes). | -| `structured.communities.names` | Dict | | Dictionary mapping BGP community codes to friendly names (used by `name` mode). | +For devices with structured traceroute support (Arista EOS, FRRouting, Huawei VRP, Juniper Junos, and MikroTik RouterOS), hyperglass can enhance the output with IP enrichment data including ASN information, organization names, country codes, and IXP detection using offline data from BGP.tools and PeeringDB. + +| Parameter | Type | Default Value | Description | +| :-------------------------------- | :-------------- | :------------ | :------------------------------------------------------------------------------------------------------------------------------------- | +| `structured.rpki.mode` | String | router | Use `router` to use the router's view of the RPKI state, or `external` to use an external validation service. | +| `structured.rpki.backend` | String | cloudflare | When using `external` mode, choose `cloudflare` or `routinator` as the validation backend. | +| `structured.rpki.rpki_server_url` | String | | When using `routinator` backend, specify the base URL of your Routinator server (e.g., `http://rpki.example.com:3323`). | +| `structured.communities.mode` | String | deny | Use `deny` to deny any communities listed, `permit` to _only_ permit communities listed, or `name` to append friendly names. | +| `structured.communities.items` | List of Strings | | List of communities to match (used by `deny` and `permit` modes). | +| `structured.communities.names` | Dict | | Dictionary mapping BGP community codes to friendly names (used by `name` mode). | +| `structured.ip_enrichment.cache_timeout` | Integer | 86400 | Cache timeout in seconds for IP enrichment data (minimum 24 hours/86400 seconds). 
| +| `structured.ip_enrichment.enrich_traceroute`| Boolean | true | When `structured:` is present, enable IP enrichment of traceroute hops (ASN, org, IXP). This must be true for enrichment to run. | +| `structured.enable_for_traceroute`| Boolean | (when structured present) true | When `structured:` is present this controls whether the structured traceroute table output is shown. Set to false to force raw router output. | +| `structured.enable_for_bgp_route`| Boolean | (when structured present) true | When `structured:` is present this controls whether the structured BGP route table output is shown. Set to false to force raw router output. | ### RPKI Examples @@ -104,3 +110,115 @@ structured: "65000:1102": "Upstream B Location 1" "65000:2000": "IXP Any" ``` + +### IP Enrichment Examples + + + **IP Enrichment Requirements** + + IP enrichment is currently supported for traceroute outputs on supported platforms. + + The system uses offline data from BGP.tools (1.3M+ CIDR entries) and PeeringDB for maximum performance and reliability. + + +#### Enable IP Enrichment for Traceroute + +```yaml filename="config.yaml" copy {2-4} +structured: + # Ensure `structured:` exists to enable structured output. By default the + # structured table output is enabled when this block is present. To disable + # the structured traceroute table, set `structured.enable_for_traceroute: false`. 
+ ip_enrichment: + enrich_traceroute: true +``` + +#### Enable IP Enrichment with Custom Cache Timeout + +```yaml filename="config.yaml" copy {2-5} +structured: + ip_enrichment: + enrich_traceroute: true + cache_timeout: 172800 # 48 hours +``` + +#### Enable IP Enrichment for Traceroute + +```yaml filename="config.yaml" copy {2-4} +structured: + ip_enrichment: + enrich_traceroute: true + cache_timeout: 86400 # 24 hours (minimum) +``` + + + **Performance Considerations** + + - Initial cache loading may take 30-60 seconds on first startup + - Data is cached locally using pickle format for ultra-fast subsequent loads + - Cache files are stored in `/etc/hyperglass/ip_enrichment/` + - Minimum cache timeout is 24 hours (86400 seconds) to prevent excessive API usage + + +### Structured Traceroute Configuration + + + **Structured Traceroute Support** + + Structured traceroute with rich metadata is available for: + - **Arista EOS**: Parses Unix-style traceroute output with hostname, multiple RTT support, and MPLS labels + - **FRRouting**: Parses Unix-style traceroute output with load balancing and multi-path support + - **Huawei VRP**: Parses Unix-style traceroute output + - **Juniper Junos**: Parses traceroute output with MPLS labels, multipath, and partial timeouts + - **MikroTik RouterOS/SwitchOS**: Parses multi-table format with statistics + + When IP enrichment is enabled, traceroute hops are enhanced with ASN numbers, organization names, country codes, prefixes, and IXP detection. 
+ + +#### Complete Structured Traceroute Setup + +```yaml filename="config.yaml" copy {2-12} +structured: + rpki: + mode: external + backend: routinator + rpki_server_url: "https://rpki.example.com" + communities: + mode: name + names: + "65000:1000": "Transit Routes" + "65000:2000": "Peer Routes" + ip_enrichment: + enrich_traceroute: true + cache_timeout: 86400 +``` + +#### Structured Traceroute with Cloudflare RPKI + +```yaml filename="config.yaml" copy {2-9} +structured: + rpki: + mode: external + backend: cloudflare + ip_enrichment: + enrich_traceroute: true +``` + +#### Minimal Structured Traceroute (No IP Enrichment) + +```yaml filename="config.yaml" copy {2-4} +structured: + ip_enrichment: + enrich_traceroute: false # Traceroute will show basic hop info without ASN/org data +``` + + + **IP Enrichment Dependency** + + Without IP enrichment enabled: + - Traceroute hops will only show IP addresses and RTT values + - No ASN, organization names, or country information will be displayed + - AS path visualization will be limited or unavailable + - IXP detection will not function + + For the full structured traceroute experience with rich metadata, `ip_enrichment.enrich_traceroute: true` is required. 
+ diff --git a/hyperglass/api/__init__.py b/hyperglass/api/__init__.py index 414a450..34f185b 100644 --- a/hyperglass/api/__init__.py +++ b/hyperglass/api/__init__.py @@ -15,8 +15,17 @@ from hyperglass.constants import __version__ from hyperglass.exceptions import HyperglassError # Local -from .events import check_redis, init_ip_enrichment -from .routes import info, query, device, devices, queries +from .events import check_redis +from .routes import ( + info, + query, + device, + devices, + queries, + ip_enrichment_status, + ip_enrichment_refresh, + aspath_enrich, +) from .middleware import COMPRESSION_CONFIG, create_cors_config from .error_handlers import app_handler, http_handler, default_handler, validation_handler @@ -42,6 +51,9 @@ HANDLERS = [ queries, info, query, + ip_enrichment_status, + ip_enrichment_refresh, + aspath_enrich, ] if not STATE.settings.disable_ui: @@ -64,7 +76,7 @@ app = Litestar( ValidationException: validation_handler, Exception: default_handler, }, - on_startup=[check_redis, init_ip_enrichment], + on_startup=[check_redis], debug=STATE.settings.debug, cors_config=create_cors_config(state=STATE), compression_config=COMPRESSION_CONFIG, diff --git a/hyperglass/api/events.py b/hyperglass/api/events.py index 942c6e3..6185265 100644 --- a/hyperglass/api/events.py +++ b/hyperglass/api/events.py @@ -10,7 +10,7 @@ from litestar import Litestar from hyperglass.state import use_state from hyperglass.log import log -__all__ = ("check_redis", "init_ip_enrichment") +__all__ = ("check_redis",) async def check_redis(_: Litestar) -> t.NoReturn: @@ -19,25 +19,6 @@ async def check_redis(_: Litestar) -> t.NoReturn: cache.check() -async def init_ip_enrichment(_: Litestar) -> None: - """Initialize IP enrichment data at startup.""" - try: - params = use_state("params") - if not params.structured.ip_enrichment.enabled: - log.debug("IP enrichment disabled, skipping initialization") - return - except Exception as e: - log.debug(f"Could not check IP enrichment 
config: {e}") - return - - try: - from hyperglass.external.ip_enrichment import _service - - log.info("Initializing IP enrichment data at startup...") - success = await _service.ensure_data_loaded() - if success: - log.info("IP enrichment data loaded successfully at startup") - else: - log.warning("Failed to load IP enrichment data at startup") - except Exception as e: - log.error(f"Error initializing IP enrichment data: {e}") +# init_ip_enrichment removed: startup refresh is intentionally disabled and +# IP enrichment data is loaded on-demand when required. Keeping a no-op +# startup hook adds no value and may cause confusion. diff --git a/hyperglass/api/routes.py b/hyperglass/api/routes.py index 61c4cf6..a9ce00a 100644 --- a/hyperglass/api/routes.py +++ b/hyperglass/api/routes.py @@ -60,6 +60,27 @@ __all__ = ( ) +@post("/api/aspath/enrich") +async def aspath_enrich(data: dict) -> dict: + """Enrich a list of ASNs with organization names on demand. + + Expected JSON payload: { "as_path": [123, 456, ...] 
} + """ + try: + as_path = data.get("as_path", []) if isinstance(data, dict) else [] + if not as_path: + return {"success": False, "error": "No as_path provided"} + + # Convert to strings and call the existing bulk lookup + from hyperglass.external.ip_enrichment import lookup_asns_bulk + + asn_strings = [str(a) for a in as_path] + results = await lookup_asns_bulk(asn_strings) + return {"success": True, "asn_organizations": results} + except Exception as e: + return {"success": False, "error": str(e)} + + @get("/api/devices/{id:str}", dependencies={"devices": Provide(get_devices)}) async def device(devices: Devices, id: str) -> APIDevice: """Retrieve a device by ID.""" @@ -163,6 +184,39 @@ async def query(_state: HyperglassState, request: Request, data: Query) -> Query structured=data.device.structured_output or False, ) else: + # Best-effort: if IP enrichment is enabled, schedule a + # non-blocking background refresh so the service can + # update PeeringDB caches without relying on the client. + try: + from hyperglass.state import use_state + + params = use_state("params") + if ( + getattr(params, "structured", None) + and params.structured.ip_enrichment.enrich_traceroute + and getattr(params.structured, "enable_for_traceroute", None) + is not False + ): + try: + from hyperglass.external.ip_enrichment import ( + refresh_ip_enrichment_data, + ) + + async def _bg_refresh(): + try: + await refresh_ip_enrichment_data(force=False) + except Exception as e: + _log.debug("Background IP enrichment refresh failed: {}", e) + + # Schedule background refresh and don't await it. 
+ asyncio.create_task(_bg_refresh()) + except Exception: + # If import or scheduling fails, proceed without refresh + pass + except Exception: + # If we can't access params, skip background refresh + pass + # Pass request to execution module output = await execute(data) @@ -183,18 +237,43 @@ async def query(_state: HyperglassState, request: Request, data: Query) -> Query else: raw_output = str(output) - # Only cache successful results - await loop.run_in_executor( - None, partial(cache.set_map_item, cache_key, "output", raw_output) - ) - await loop.run_in_executor( - None, partial(cache.set_map_item, cache_key, "timestamp", timestamp) - ) - await loop.run_in_executor( - None, partial(cache.expire, cache_key, expire_in=_state.params.cache.timeout) - ) + # Detect semantically-empty structured outputs and avoid caching them. + # Examples: + # - BGPRouteTable: {'count': 0, 'routes': []} + # - TracerouteResult: {'hops': []} + skip_cache_empty = False + try: + if json_output and isinstance(raw_output, dict): + # BGP route table empty + if "count" in raw_output and "routes" in raw_output: + if raw_output.get("count", 0) == 0 or not raw_output.get("routes"): + skip_cache_empty = True + # Traceroute result empty + if "hops" in raw_output and (not raw_output.get("hops")): + skip_cache_empty = True + except Exception: + # If any unexpected shape is encountered, don't skip caching by + # accident — fall back to normal behavior. 
+ skip_cache_empty = False - _log.bind(cache_timeout=_state.params.cache.timeout).debug("Response cached") + if not skip_cache_empty: + # Only cache successful, non-empty results + await loop.run_in_executor( + None, partial(cache.set_map_item, cache_key, "output", raw_output) + ) + await loop.run_in_executor( + None, partial(cache.set_map_item, cache_key, "timestamp", timestamp) + ) + await loop.run_in_executor( + None, + partial(cache.expire, cache_key, expire_in=_state.params.cache.timeout), + ) + + _log.bind(cache_timeout=_state.params.cache.timeout).debug("Response cached") + else: + _log.bind(cache_key=cache_key).warning( + "Structured output was empty (e.g. 0 routes / 0 hops) - skipping cache to allow immediate retry" + ) runtime = int(round(elapsedtime, 0)) @@ -263,6 +342,21 @@ async def ip_enrichment_refresh(force: bool = False) -> dict: try: from hyperglass.external.ip_enrichment import refresh_ip_enrichment_data + # If enrichment is disabled in config, return a clear message + try: + from hyperglass.state import use_state + + params = use_state("params") + if ( + not getattr(params, "structured", None) + or not params.structured.ip_enrichment.enrich_traceroute + or getattr(params.structured, "enable_for_traceroute", None) is False + ): + return {"success": False, "message": "IP enrichment for traceroute is not enabled"} + except Exception: + # If config can't be read, proceed with refresh call and let it decide + pass + success = await refresh_ip_enrichment_data(force=force) return { "success": success, diff --git a/hyperglass/execution/enrichment.py b/hyperglass/execution/enrichment.py index 961c08e..b1bd193 100644 --- a/hyperglass/execution/enrichment.py +++ b/hyperglass/execution/enrichment.py @@ -13,25 +13,28 @@ async def enrich_output_with_ip_enrichment(output: OutputDataModel) -> OutputDat """Enrich output data with IP enrichment information.""" params = use_state("params") - # Check if IP enrichment is enabled in configuration - if not 
params.structured.ip_enrichment.enabled: - log.debug("IP enrichment disabled in configuration, skipping") + # If structured block isn't present or traceroute enrichment explicitly disabled, + # skip enrichment entirely. + if ( + not getattr(params, "structured", None) + or not params.structured.ip_enrichment.enrich_traceroute + or getattr(params.structured, "enable_for_traceroute", None) is False + ): + log.debug("IP enrichment for traceroute disabled or structured config missing, skipping") return output _log = log.bind(enrichment="ip_enrichment") _log.debug("Starting IP enrichment") try: - if isinstance(output, BGPRouteTable): - if params.structured.ip_enrichment.enrich_next_hop: - _log.debug("Enriching BGP route table with next-hop information") - await output.enrich_with_ip_enrichment() - _log.info(f"Enriched {len(output.routes)} BGP routes with next-hop data") - else: - _log.debug("Next-hop enrichment disabled, skipping BGP enrichment") - - elif isinstance(output, TracerouteResult): - if params.structured.ip_enrichment.enrich_traceroute: + if isinstance(output, TracerouteResult): + # Only enrich traceroute results when structured config exists, + # per-feature top-level flag isn't False, and ip_enrichment is enabled. + if ( + getattr(params, "structured", None) + and params.structured.ip_enrichment.enrich_traceroute + and getattr(params.structured, "enable_for_traceroute", None) is not False + ): _log.debug("Enriching traceroute hops with ASN information") await output.enrich_with_ip_enrichment() diff --git a/hyperglass/external/ip_enrichment.py b/hyperglass/external/ip_enrichment.py index 660e27d..5e49fab 100644 --- a/hyperglass/external/ip_enrichment.py +++ b/hyperglass/external/ip_enrichment.py @@ -1,17 +1,12 @@ -"""IP enrichment service - the main network lookup system for hyperglass. +"""IP enrichment: ASN and IXP lookups for hyperglass. 
-This completely replaces bgp.tools with bulk data approach using: -- BGP.tools static files for CIDR->ASN mapping -- BGP.tools ASN database for ASN->Organization names -- PeeringDB for IXP detection - -Core Functions: -- lookup_ip(ip_address) -> ASN number/name OR IXP name -- lookup_asn_name(asn_number) -> ASN organization name -- network_info(*ips) -> bulk lookup (for compatibility) +Uses bgp.tools for ASN lookups and PeeringDB for IXP prefixes. +Provides lookup_ip, lookup_asn_name and network_info compatibility APIs. """ import asyncio +import time +import fcntl import json import csv import pickle @@ -19,10 +14,140 @@ import typing as t from datetime import datetime, timedelta from ipaddress import ip_address, ip_network, IPv4Address, IPv6Address from pathlib import Path +import socket from hyperglass.log import log from hyperglass.state import use_state +# Process-wide lock to coordinate downloads across worker processes. +# Uses an on-disk lock directory so separate processes don't simultaneously +# download enrichment data and cause rate limits. + + +class _ProcessFileLock: + """Async-friendly, process-wide filesystem lock. + + Provides an async context manager that runs blocking mkdir/remove + operations in an executor so multiple processes can coordinate. + """ + + def __init__(self, lock_path: Path, timeout: int = 300, poll_interval: float = 0.1): + self.lock_path = lock_path + self.timeout = timeout + self.poll_interval = poll_interval + self._lock_dir: t.Optional[str] = None + # Small startup jitter (seconds) to reduce thundering herd on many + # worker processes starting at the same time. + self._startup_jitter = 0.25 + + def _acquire_blocking(self) -> None: + # Use atomic mkdir on a .lck directory as the lock primitive. 
+ import os + import random + import json + import shutil + + lock_dir = str(self.lock_path) + ".lck" + + # Small jitter before first attempt to reduce concurrent mkdirs + time.sleep(random.uniform(0, self._startup_jitter)) + start = time.time() + + while True: + try: + # Try to create the lock directory atomically; on success we + # hold the lock. If it exists, retry until timeout. + os.mkdir(lock_dir) + + # Write a small owner metadata file to help debugging stale locks + try: + owner = {"pid": os.getpid(), "created": datetime.now().isoformat()} + with open(os.path.join(lock_dir, "owner.json"), "w") as f: + json.dump(owner, f) + except Exception: + # Not critical; proceed even if writing metadata fails + pass + + self._lock_dir = lock_dir + log.debug(f"Acquired process lock {lock_dir} (pid={os.getpid()})") + return + except FileExistsError: + # If the lock appears stale (older than timeout), try cleanup. + try: + owner_file = os.path.join(lock_dir, "owner.json") + mtime = None + if os.path.exists(owner_file): + mtime = os.path.getmtime(owner_file) + else: + mtime = os.path.getmtime(lock_dir) + + # If owner file/dir mtime is older than timeout, remove it + if (time.time() - mtime) >= self.timeout: + log.warning(f"Removing stale lock directory {lock_dir}") + try: + shutil.rmtree(lock_dir) + except Exception: + # If we can't remove it, we'll continue to wait until + # the timeout is reached by this acquisition attempt. 
+ pass + # After attempted cleanup, loop and try mkdir again + continue + except Exception: + # Ignore issues during stale-check and continue waiting + pass + + if (time.time() - start) >= self.timeout: + raise TimeoutError(f"Timed out waiting for lock {self.lock_path}") + time.sleep(self.poll_interval) + + def _release_blocking(self) -> None: + import os + import shutil + + try: + if self._lock_dir: + try: + owner_file = os.path.join(self._lock_dir, "owner.json") + if os.path.exists(owner_file): + try: + os.remove(owner_file) + except Exception: + pass + + # Attempt to remove the directory. If it's empty, rmdir will + # succeed; if not, fall back to recursive removal as a best-effort. + try: + os.rmdir(self._lock_dir) + except Exception: + try: + shutil.rmtree(self._lock_dir) + except Exception: + log.debug(f"Failed to fully remove lock dir {self._lock_dir}") + + log.debug(f"Released process lock {self._lock_dir}") + self._lock_dir = None + except Exception: + # Best-effort; ignore errors removing the lock dir + pass + except Exception: + # Nothing we can do on release failure + pass + + async def __aenter__(self): + loop = asyncio.get_running_loop() + # Run blocking acquire in executor + await loop.run_in_executor(None, self._acquire_blocking) + return self + + async def __aexit__(self, exc_type, exc, tb): + loop = asyncio.get_running_loop() + await loop.run_in_executor(None, self._release_blocking) + + +# Instantiate a process-global lock file in the data dir. The data dir may not yet +# exist at import time; the constant path is defined below and we'll initialize +# the actual _download_lock after the paths are declared. (See below.) 
+ # Optional dependencies - graceful fallback if not available try: import httpx @@ -38,25 +163,20 @@ except ImportError: # File paths for persistent storage IP_ENRICHMENT_DATA_DIR = Path("/etc/hyperglass/ip_enrichment") -CIDR_DATA_FILE = IP_ENRICHMENT_DATA_DIR / "cidr_data.json" -ASN_DATA_FILE = IP_ENRICHMENT_DATA_DIR / "asn_data.json" -IXP_DATA_FILE = IP_ENRICHMENT_DATA_DIR / "ixp_data.json" +IXP_PICKLE_FILE = IP_ENRICHMENT_DATA_DIR / "ixp_data.pickle" LAST_UPDATE_FILE = IP_ENRICHMENT_DATA_DIR / "last_update.txt" -COMBINED_CACHE_FILE = IP_ENRICHMENT_DATA_DIR / "combined_cache.pickle" -# Raw data files for debugging/inspection -RAW_TABLE_FILE = IP_ENRICHMENT_DATA_DIR / "table.jsonl" -RAW_ASNS_FILE = IP_ENRICHMENT_DATA_DIR / "asns.csv" - -# Data URLs -BGP_TOOLS_TABLE_URL = "https://bgp.tools/table.jsonl" -BGP_TOOLS_ASNS_URL = "https://bgp.tools/asns.csv" -PEERINGDB_IXPFX_URL = "https://www.peeringdb.com/api/ixpfx" - -# Cache duration (24 hours default, configurable) +# Cache duration (seconds). Default: 24 hours. Can be overridden in config. DEFAULT_CACHE_DURATION = 24 * 60 * 60 +# Lazily-created process-wide download lock. Create this after the data +# directory is ensured to exist to avoid open() failing due to a missing +# parent directory and to ensure the lock file lives under the same path +# for all workers. +_download_lock: t.Optional[_ProcessFileLock] = None + + def get_cache_duration() -> int: """Get cache duration from config, ensuring minimum of 24 hours.""" try: @@ -71,73 +191,42 @@ def get_cache_duration() -> int: def should_refresh_data(force_refresh: bool = False) -> tuple[bool, str]: - """Check if data should be refreshed and return reason.""" + """Decide whether to refresh IXP data. Only PeeringDB IXP prefixes are + considered relevant for startup refresh; BGP.tools bulk files are not used. 
+ """ if force_refresh: return True, "Force refresh requested" - if not LAST_UPDATE_FILE.exists(): - return True, "No timestamp file found" + # No persistent backoff marker; decide refresh purely by file age / config + # and any transient network errors will be handled by the downloader's + # retry logic. - # Check each required file individually - if ANY are missing, refresh ALL - required_files = [ - (CIDR_DATA_FILE, "cidr_data.json"), - (ASN_DATA_FILE, "asn_data.json"), - (IXP_DATA_FILE, "ixp_data.json"), - ] + # If an IXP file exists, prefer it and do not perform automatic refreshes + # unless the caller explicitly requested a force refresh. + if IXP_PICKLE_FILE.exists() and not force_refresh: + return False, "ixp_data.json exists; skipping automatic refresh" - missing_files = [] - for file_path, file_name in required_files: - if not file_path.exists(): - missing_files.append(file_name) + # If IXP file is missing, refresh is needed + if not IXP_PICKLE_FILE.exists(): + return True, "No ixp_data.json present" - if missing_files: - return True, f"Missing data files: {', '.join(missing_files)}" - - # Check file age + # Otherwise check timestamp age try: with open(LAST_UPDATE_FILE, "r") as f: cached_time = datetime.fromisoformat(f.read().strip()) - age_seconds = (datetime.now() - cached_time).total_seconds() cache_duration = get_cache_duration() - if age_seconds >= cache_duration: age_hours = age_seconds / 3600 return True, f"Data expired (age: {age_hours:.1f}h, max: {cache_duration/3600:.1f}h)" - except Exception as e: + # If reading timestamp fails, prefer a refresh so we don't rely on stale data return True, f"Failed to read timestamp: {e}" return False, "Data is fresh" -def validate_data_files() -> tuple[bool, str]: - """Validate that data files contain reasonable data.""" - try: - # Check CIDR data - if CIDR_DATA_FILE.exists(): - with open(CIDR_DATA_FILE, "r") as f: - cidr_data = json.load(f) - if not isinstance(cidr_data, list) or len(cidr_data) < 1000: - 
return ( - False, - f"CIDR data invalid or too small: {len(cidr_data) if isinstance(cidr_data, list) else 'not a list'}", - ) - - # Check ASN data - if ASN_DATA_FILE.exists(): - with open(ASN_DATA_FILE, "r") as f: - asn_data = json.load(f) - if not isinstance(asn_data, dict) or len(asn_data) < 100: - return ( - False, - f"ASN data invalid or too small: {len(asn_data) if isinstance(asn_data, dict) else 'not a dict'}", - ) - - return True, "Data files are valid" - - except Exception as e: - return False, f"Data validation failed: {e}" +# validate_data_files removed - legacy BGP.tools bulk files are no longer used # Simple result classes @@ -187,12 +276,20 @@ class IPEnrichmentService: # Combined cache for ultra-fast loading self._combined_cache: t.Optional[t.Dict[str, t.Any]] = None + # Per-IP in-memory cache for bgp.tools lookups: ip -> (asn, asn_name, prefix, expires_at) + self._per_ip_cache: t.Dict[ + str, t.Tuple[t.Optional[int], t.Optional[str], t.Optional[str], float] + ] = {} + # Small in-memory cache for per-IP lookups to avoid repeated websocket + # queries during runtime. 
Maps ip_str -> (asn, asn_name, prefix) + self._ip_cache: t.Dict[str, t.Tuple[t.Optional[int], t.Optional[str], t.Optional[str]]] = {} + # Lock to serialize data load so concurrent callers don't duplicate work + self._ensure_lock = asyncio.Lock() def _optimize_lookups(self): """Convert IP networks to integer format for faster lookups.""" if self._lookup_optimized: return - log.debug("Optimizing IP lookup structures...") optimize_start = datetime.now() @@ -201,440 +298,861 @@ class IPEnrichmentService: for net_addr, prefixlen, asn, cidr_string in self.cidr_networks: if isinstance(net_addr, IPv4Address): - # Convert IPv4 to integer for fast bitwise operations net_int = int(net_addr) mask_bits = 32 - prefixlen self._ipv4_networks.append((net_int, mask_bits, asn, cidr_string)) else: - # Convert IPv6 to integer net_int = int(net_addr) mask_bits = 128 - prefixlen self._ipv6_networks.append((net_int, mask_bits, asn, cidr_string)) - # Sort by mask bits (ascending) for longest-match-first self._ipv4_networks.sort(key=lambda x: x[1]) self._ipv6_networks.sort(key=lambda x: x[1]) optimize_time = (datetime.now() - optimize_start).total_seconds() log.debug( - f"Optimized lookups: {len(self._ipv4_networks)} IPv4, {len(self._ipv6_networks)} IPv6 " - f"networks in {optimize_time:.2f}s" + f"Optimized lookups: {len(self._ipv4_networks)} IPv4, {len(self._ipv6_networks)} IPv6 (took {optimize_time:.2f}s)" ) self._lookup_optimized = True - def _save_combined_cache(self): - """Save all data structures to a single pickle file for ultra-fast loading.""" + def _try_load_pickle(self) -> bool: + """Attempt to load the optimized pickle from disk without triggering downloads. + + This is a best-effort, non-blocking load used during runtime lookups so + we don't attempt network refreshes or acquire process locks while + serving user requests. 
+ """ try: - cache_data = { - "cidr_networks": self.cidr_networks, - "asn_info": self.asn_info, - "ixp_networks": self.ixp_networks, - "ipv4_networks": self._ipv4_networks, - "ipv6_networks": self._ipv6_networks, - "last_update": self.last_update, - "lookup_optimized": self._lookup_optimized, - } - - with open(COMBINED_CACHE_FILE, "wb") as f: - pickle.dump(cache_data, f, protocol=pickle.HIGHEST_PROTOCOL) - - log.debug( - f"Saved combined cache with {len(self.cidr_networks)} CIDR entries to pickle file" - ) - except Exception as e: - log.error(f"Failed to save combined cache: {e}") - - def _load_combined_cache(self) -> bool: - """Load all data structures from pickle file.""" - if not COMBINED_CACHE_FILE.exists(): - return False - - try: - with open(COMBINED_CACHE_FILE, "rb") as f: - cache_data = pickle.load(f) - - self.cidr_networks = cache_data["cidr_networks"] - self.asn_info = cache_data["asn_info"] - self.ixp_networks = cache_data["ixp_networks"] - self._ipv4_networks = cache_data["ipv4_networks"] - self._ipv6_networks = cache_data["ipv6_networks"] - self.last_update = cache_data["last_update"] - self._lookup_optimized = cache_data["lookup_optimized"] - - log.debug( - f"Loaded combined cache with {len(self.cidr_networks)} CIDR entries from pickle file" - ) - return True - except Exception as e: - log.error(f"Failed to load combined cache: {e}") - return False + pickle_path = IP_ENRICHMENT_DATA_DIR / "ixp_data.pickle" + if pickle_path.exists(): + try: + with open(pickle_path, "rb") as f: + parsed = pickle.load(f) + if parsed and isinstance(parsed, list) and len(parsed) > 0: + self.ixp_networks = [ + (ip_address(net), prefixlen, name) for net, prefixlen, name in parsed + ] + log.debug( + "Loaded {} IXP prefixes from optimized pickle (non-blocking)", + len(self.ixp_networks), + ) + return True + except Exception as e: + log.debug("Non-blocking pickle load failed: {}", e) + except Exception: + pass + return False async def ensure_data_loaded(self, force_refresh: 
bool = False) -> bool: - """Ensure data is loaded and fresh from persistent files.""" + """Ensure data is loaded and fresh from persistent files. + + New behavior: only load PeeringDB IXP prefixes at startup. Do NOT bulk + download BGP.tools CIDR or ASN data. Per-IP ASN lookups will query the + bgp.tools API (websocket preferred) on-demand. + """ + # Create data directory if it doesn't exist IP_ENRICHMENT_DATA_DIR.mkdir(parents=True, exist_ok=True) - # Check if refresh is needed - should_refresh, reason = should_refresh_data(force_refresh) + # Lazily instantiate the process-wide download lock now that the + # data directory exists and is guaranteed to be the same path for + # all worker processes. + global _download_lock + if _download_lock is None: + _download_lock = _ProcessFileLock(IP_ENRICHMENT_DATA_DIR / "download.lock") - if not should_refresh: - # Validate existing data files - is_valid, validation_msg = validate_data_files() - if not is_valid: - should_refresh = True - reason = f"Data validation failed: {validation_msg}" - - if not should_refresh: - # Try to load from ultra-fast pickle cache first - if self._load_combined_cache(): - age_hours = ( - (datetime.now() - self.last_update).total_seconds() / 3600 - if self.last_update - else 0 - ) - log.info(f"Loading IP enrichment data from pickle cache (age: {age_hours:.1f}h)") - log.debug( - f"Cache contains: {len(self.cidr_networks)} CIDR entries, " - f"{len(self.asn_info)} ASN entries, {len(self.ixp_networks)} IXP networks" - ) - return True - - # Fallback to JSON files if pickle cache failed - try: - with open(CIDR_DATA_FILE, "r") as f: - cidr_data = json.load(f) - with open(ASN_DATA_FILE, "r") as f: - asn_data = json.load(f) - with open(IXP_DATA_FILE, "r") as f: - ixp_data = json.load(f) - with open(LAST_UPDATE_FILE, "r") as f: - cached_time = datetime.fromisoformat(f.read().strip()) - - age_hours = (datetime.now() - cached_time).total_seconds() / 3600 - log.info(f"Loading IP enrichment data from JSON files 
(age: {age_hours:.1f}h)") - log.debug( - f"Files contain: {len(cidr_data)} CIDR entries, " - f"{len(asn_data)} ASN entries, {len(ixp_data)} IXP networks" - ) - - # Convert string IP addresses back to IP objects - self.cidr_networks = [ - (ip_address(net), prefixlen, asn, cidr) - for net, prefixlen, asn, cidr in cidr_data - ] - # ASN data has integer keys that become strings in JSON - self.asn_info = {int(k): v for k, v in asn_data.items()} - self.ixp_networks = [ - (ip_address(net), prefixlen, name) for net, prefixlen, name in ixp_data - ] - self.last_update = cached_time - - # Reset optimization flag so it gets rebuilt with new data - self._lookup_optimized = False - - # Save to pickle cache for next time - self._optimize_lookups() - self._save_combined_cache() - - return True - - except Exception as e: - log.warning(f"Failed to load existing data files: {e} - will refresh") - should_refresh = True - reason = f"Failed to load files: {e}" - - # Download fresh data - log.info(f"Refreshing IP enrichment data: {reason}") - - if not httpx: - log.error("httpx not available - cannot download IP enrichment data") - return False - - try: - log.info("🌐 Starting fresh IP enrichment data download...") - download_start = datetime.now() - - async with httpx.AsyncClient(timeout=300) as client: - # Track which downloads succeeded - bgp_success = False - ixp_success = False - - # Try to download BGP data (required) - try: - await self._download_bgp_data(client) - bgp_success = True - log.debug("✅ BGP data download successful") - except Exception as e: - log.error(f"❌ BGP data download failed: {e}") - # BGP data is critical - if this fails, we can't continue - raise Exception(f"Critical BGP data download failed: {e}") - - # Try to download IXP data (optional but preferred) - try: - await self._download_ixp_data(client) - ixp_success = True - log.debug("✅ IXP data download successful") - except Exception as e: - log.error(f"❌ IXP data download failed: {e}") - # IXP data is optional 
- clear any partial data and continue - self.ixp_networks = [] - log.warning("Continuing without IXP data - IXP detection will be unavailable") - - download_duration = (datetime.now() - download_start).total_seconds() - - if not bgp_success: - # This shouldn't happen due to the raise above, but be explicit - raise Exception("BGP data download failed - cannot continue") - - log.info( - f"📊 Download summary: BGP data: ✅, IXP data: {'✅' if ixp_success else '❌'}" - ) - - # Continue with saving even if IXP failed... - - # Save the data to persistent files - log.debug("💾 Saving IP enrichment data to persistent files...") - cache_start = datetime.now() - - # Convert IP addresses to strings for JSON serialization - cidr_file_data = [ - (str(net), prefixlen, asn, cidr) for net, prefixlen, asn, cidr in self.cidr_networks - ] - ixp_file_data = [ - (str(net), prefixlen, name) for net, prefixlen, name in self.ixp_networks - ] - - with open(CIDR_DATA_FILE, "w") as f: - json.dump(cidr_file_data, f, separators=(",", ":")) # Compact JSON - with open(ASN_DATA_FILE, "w") as f: - json.dump(self.asn_info, f, separators=(",", ":")) - with open(IXP_DATA_FILE, "w") as f: - json.dump(ixp_file_data, f, separators=(",", ":")) - with open(LAST_UPDATE_FILE, "w") as f: - f.write(datetime.now().isoformat()) - - cache_duration_actual = (datetime.now() - cache_start).total_seconds() - - self.last_update = datetime.now() - - # Optimize lookups and create pickle cache for ultra-fast loading - self._lookup_optimized = False - self._optimize_lookups() - self._save_combined_cache() - - log.info(f"✅ IP enrichment data loaded successfully!") - log.info( - f"📊 Data summary: {len(self.cidr_networks)} CIDR entries, " - f"{len(self.asn_info)} ASN entries, {len(self.ixp_networks)} IXP networks" - ) - log.debug( - f"⏱️ Download time: {download_duration:.1f}s, Save time: {cache_duration_actual:.1f}s" - ) + # Fast-path: if already loaded in memory, return immediately + if self.ixp_networks: return True + # 
Serialize loads to avoid duplicate file reads when multiple callers + # call ensure_data_loaded concurrently. + async with self._ensure_lock: + # Double-check after acquiring the lock + if self.ixp_networks: + return True + + # Fast-path: if an optimized pickle exists and the caller did not + # request a forced refresh, load it (fastest). Fall back to the + # legacy JSON IXP file or downloads if the pickle is missing or + # invalid. This ensures the pickle is the preferred on-disk cache + # for faster startup. + try: + pickle_path = IP_ENRICHMENT_DATA_DIR / "ixp_data.pickle" + if pickle_path.exists() and not force_refresh: + try: + with open(pickle_path, "rb") as f: + parsed = pickle.load(f) + if parsed and isinstance(parsed, list) and len(parsed) > 0: + self.ixp_networks = [ + (ip_address(net), prefixlen, name) + for net, prefixlen, name in parsed + ] + log.info( + f"Loaded {len(self.ixp_networks)} IXP prefixes from optimized pickle (fast-path)" + ) + return True + else: + log.warning( + "Optimized pickle exists but appears empty or invalid; falling back to JSON/load or refresh" + ) + except Exception as e: + log.warning( + f"Failed to load optimized pickle {pickle_path}: {e}; falling back" + ) + except Exception: + # Non-fatal; continue to JSON/download logic + pass + + # Immediate guard: if an optimized pickle exists on disk and the + # caller did not request a forced refresh, prefer it and skip any + # network downloads. This keeps startup fast by loading the already + # generated optimized mapping. 
+ try: + pickle_path = IP_ENRICHMENT_DATA_DIR / "ixp_data.pickle" + if pickle_path.exists() and not force_refresh: + try: + with open(pickle_path, "rb") as f: + parsed = pickle.load(f) + if parsed and isinstance(parsed, list) and len(parsed) > 0: + self.ixp_networks = [ + (ip_address(net), prefixlen, name) + for net, prefixlen, name in parsed + ] + log.info( + f"Loaded {len(self.ixp_networks)} IXP prefixes from optimized pickle (early guard)" + ) + return True + else: + log.warning( + "Optimized pickle exists but appears empty or invalid; will attempt to refresh" + ) + except Exception as e: + log.warning( + f"Failed to read optimized pickle: {e}; will attempt to refresh" + ) + except Exception: + # Ignore filesystem errors and continue to refresh logic + pass + + # No operator raw-dump conversion: rely on endpoint JSON files (ixpfx.json, + # ixlan.json, ix.json) in the data directory or download them from + # PeeringDB when a refresh is required. Determine whether we should + # refresh based on the backoff marker / cache duration. + should_refresh, reason = should_refresh_data(force_refresh) + + # If an optimized pickle exists, prefer it and avoid downloads unless forced. 
+ try: + pickle_path = IP_ENRICHMENT_DATA_DIR / "ixp_data.pickle" + if pickle_path.exists(): + try: + st = pickle_path.stat() + size = getattr(st, "st_size", None) + except Exception: + size = None + + # If file size indicates non-empty file try to load + if size is not None and size > 0: + try: + with open(pickle_path, "rb") as f: + parsed = pickle.load(f) + except Exception as e: + log.warning(f"Failed to parse existing optimized IXP pickle: {e}") + parsed = None + + if parsed and isinstance(parsed, list) and len(parsed) > 0: + self.ixp_networks = [ + (ip_address(net), prefixlen, name) for net, prefixlen, name in parsed + ] + log.info( + f"Loaded {len(self.ixp_networks)} IXP prefixes from optimized pickle (size={size})" + ) + return True + else: + log.warning( + "Existing optimized pickle appears empty or invalid (size={}) ; will attempt to refresh", + size, + ) + else: + log.debug( + f"Optimized pickle exists but size indicates empty or very small (size={size})" + ) except Exception as e: - log.error(f"Failed to download IP enrichment data: {e}") + log.warning(f"Failed to load existing optimized IXP data: {e}") + + # If we're currently under a backoff or refresh is not required, skip downloading + if not should_refresh: + # If the optimized pickle is missing but the raw PeeringDB JSON files + # are present and the last_update timestamp is still within the + # configured cache duration, attempt to build the optimized pickle + # from the existing JSON files instead of downloading. 
+ try: + pickle_path = IP_ENRICHMENT_DATA_DIR / "ixp_data.pickle" + json_paths = [ + IP_ENRICHMENT_DATA_DIR / "ixpfx.json", + IP_ENRICHMENT_DATA_DIR / "ixlan.json", + IP_ENRICHMENT_DATA_DIR / "ix.json", + ] + + have_all_json = all(p.exists() for p in json_paths) + if not pickle_path.exists() and have_all_json and LAST_UPDATE_FILE.exists(): + try: + with open(LAST_UPDATE_FILE, "r") as f: + cached_time = datetime.fromisoformat(f.read().strip()) + age_seconds = (datetime.now() - cached_time).total_seconds() + cache_duration = get_cache_duration() + if age_seconds < cache_duration: + log.info("Building optimized pickle from existing PeeringDB JSON files") + loop = asyncio.get_running_loop() + ok = await loop.run_in_executor(None, self._combine_peeringdb_files) + if ok and pickle_path.exists(): + # Load the generated pickle into memory + try: + with open(pickle_path, "rb") as f: + parsed = pickle.load(f) + self.ixp_networks = [ + (ip_address(net), prefixlen, name) + for net, prefixlen, name in parsed + ] + log.info( + "Loaded %d IXP prefixes from generated pickle", + len(self.ixp_networks), + ) + return True + except Exception as e: + log.warning(f"Failed to load generated pickle: {e}") + except Exception: + # If reading last_update fails, fall through to skipping refresh + pass + + except Exception: + # Non-fatal; proceed to skip refresh + pass + + log.info("Skipping IXP refresh: {}", reason) return False - async def _download_bgp_data(self, client) -> None: - """Download BGP.tools data.""" - log.info("📥 Downloading BGP.tools CIDR table from bgp.tools...") - download_start = datetime.now() - response = await client.get(BGP_TOOLS_TABLE_URL) - response.raise_for_status() - download_time = (datetime.now() - download_start).total_seconds() - - # Save raw file for debugging - with open(RAW_TABLE_FILE, "w") as f: - f.write(response.text) - - # Process JSONL data - process_start = datetime.now() - cidr_count = 0 - total_lines = len(response.text.strip().split("\n")) - 
log.debug(f"Processing {total_lines} lines from CIDR table...") - - for line in response.text.strip().split("\n"): - if line.strip(): - try: - entry = json.loads(line) - cidr = entry.get("CIDR") - asn = entry.get("ASN") - if cidr and asn: - network = ip_network(cidr, strict=False) - self.cidr_networks.append( - (network.network_address, network.prefixlen, asn, cidr) - ) - cidr_count += 1 - except Exception as e: - log.debug(f"Failed to parse CIDR line: {line[:100]} - {e}") - continue - - process_time = (datetime.now() - process_start).total_seconds() - log.info( - f"✅ Downloaded {cidr_count}/{total_lines} CIDR entries " - f"(download: {download_time:.1f}s, process: {process_time:.1f}s)" - ) - - # Sort by prefix length (descending) for longest-match lookup - sort_start = datetime.now() - self.cidr_networks.sort(key=lambda x: x[1], reverse=True) - sort_time = (datetime.now() - sort_start).total_seconds() - log.debug(f"Sorted CIDR entries by prefix length in {sort_time:.1f}s") - - # Download ASN names - log.info("📥 Downloading BGP.tools ASN names from bgp.tools...") - download_start = datetime.now() - response = await client.get(BGP_TOOLS_ASNS_URL) - response.raise_for_status() - download_time = (datetime.now() - download_start).total_seconds() - - # Save raw file for debugging - with open(RAW_ASNS_FILE, "w") as f: - f.write(response.text) - - # Process CSV data - process_start = datetime.now() - lines = response.text.strip().split("\n") - if not lines: - log.error("Empty ASN data received") - return - - # Debug: log the first few lines to see the format - log.debug(f"ASN CSV header: {lines[0] if lines else 'NO HEADER'}") - if len(lines) > 1: - log.debug(f"ASN CSV first data line: {lines[1]}") - - reader = csv.DictReader(lines) - asn_count = 0 - total_asns = 0 - failed_count = 0 - - for row in reader: - total_asns += 1 + # Acquire lock and refresh IXP list only + async with _download_lock: + # Double-check in case another worker refreshed try: - asn_str = 
row.get("asn", "").strip() - name = row.get("name", "").strip() - country = row.get("cc", "").strip() # Country code from CC column + # Double-check: if another worker already refreshed the IXP file + # while we were waiting for the lock, load it regardless of the + # general should_refresh flag. + if IXP_PICKLE_FILE.exists(): + try: + with open(IXP_PICKLE_FILE, "rb") as f: + parsed = pickle.load(f) + except Exception as e: + log.warning( + f"Existing optimized pickle is invalid after lock wait: {e}; will attempt to refresh" + ) + parsed = None - if not asn_str: - failed_count += 1 - continue + if not parsed or (isinstance(parsed, list) and len(parsed) == 0): + log.warning( + "Existing optimized pickle is empty after lock wait; will attempt to refresh", + ) + else: + self.ixp_networks = [ + (ip_address(net), prefixlen, name) for net, prefixlen, name in parsed + ] + log.info( + f"Loaded {len(self.ixp_networks)} IXP prefixes from optimized pickle (post-lock)" + ) + return True + except Exception: + pass - # Handle ASN formats like "AS12345" or just "12345" - if asn_str.upper().startswith("AS"): - asn = int(asn_str[2:]) - else: - asn = int(asn_str) + if not httpx: + log.error("httpx not available: cannot download PeeringDB prefixes") + return False - if asn > 0 and name: - self.asn_info[asn] = {"name": name, "country": country} - asn_count += 1 - else: - failed_count += 1 + try: + async with httpx.AsyncClient(timeout=30) as client: + await self._download_ixp_data(client) + # After download+combine, ensure we actually have prefixes and + # update the last-update marker. The combined pickle is already + # written by _combine_peeringdb_files invoked by _download_ixp_data. 
+ if not self.ixp_networks or len(self.ixp_networks) == 0: + log.warning( + "Downloaded 0 IXP prefixes; keeping existing optimized pickle if present" + ) + # Even if no prefixes were combined, write a last-update + # marker so startup logic can see that a refresh was + # attempted and avoid endless retries. + try: + tmp_last = LAST_UPDATE_FILE.with_name(LAST_UPDATE_FILE.name + ".tmp") + with open(tmp_last, "w") as f: + f.write(datetime.now().isoformat()) + import os + + os.replace(tmp_last, LAST_UPDATE_FILE) + self.last_update = datetime.now() + except Exception: + log.debug("Failed to write last-update marker after empty IXP refresh") + return False + + # Update last update marker + tmp_last = LAST_UPDATE_FILE.with_name(LAST_UPDATE_FILE.name + ".tmp") + with open(tmp_last, "w") as f: + f.write(datetime.now().isoformat()) + import os + + os.replace(tmp_last, LAST_UPDATE_FILE) + + self.last_update = datetime.now() + log.info("Refreshed and saved {} IXP prefixes (pickle)", len(self.ixp_networks)) + return True except Exception as e: - failed_count += 1 - if failed_count < 5: # Only log first few failures - log.debug(f"Failed to parse ASN row {total_asns}: {row} - {e}") - continue + log.error("Failed to refresh IXP prefixes: {}", e) + # No persistent backoff behavior; log and return failure. + return False - process_time = (datetime.now() - process_start).total_seconds() - log.info( - f"✅ Downloaded {asn_count}/{total_asns} ASN entries with country codes " - f"(download: {download_time:.1f}s, process: {process_time:.1f}s, failed: {failed_count})" - ) + # end async with _ensure_lock async def _download_ixp_data(self, client) -> None: - """Download PeeringDB IXP prefixes data - simplified approach using only IXPFX.""" - log.info("📥 Downloading PeeringDB IXP prefixes from peeringdb.com...") + """Download and combine PeeringDB datasets: ixpfx, ixlan, ix. 
- max_retries = 3 - base_delay = 5 # Start with 5 second delay + Behavior: + - Download each endpoint to {name}.temp (e.g., ixpfx.temp). + - If download and JSON parsing succeed, atomically rename to {name}.json. + - If any download fails, leave existing {name}.json (if present) in place. + - After ensuring all three files exist (new or old), combine them into a + list of tuples (str(network_address), prefixlen, ixp_name), sorted by + prefixlen descending, and persist as a pickled file for fast loading. + """ + log.info("Downloading PeeringDB datasets: ixpfx, ixlan, ix") - for attempt in range(max_retries): + if not client: + log.error("HTTP client not available for PeeringDB downloads") + return + + endpoints = { + "ixpfx": "https://www.peeringdb.com/api/ixpfx", + "ixlan": "https://www.peeringdb.com/api/ixlan", + "ix": "https://www.peeringdb.com/api/ix", + } + + # Helper: fetch a URL exactly once. Do NOT retry on 429 or other + # errors - if PeeringDB is rate limiting the caller should decide + # whether to retry later. This prevents the service from reattempting + # downloads automatically and potentially worsening global rate limits. 
+ async def _fetch_with_backoff(url: str): try: - if attempt > 0: - delay = base_delay * (2**attempt) # Exponential backoff - log.info(f"Retry attempt {attempt + 1}/{max_retries} after {delay}s delay...") - await asyncio.sleep(delay) + log.debug("Downloading PeeringDB endpoint {} (single attempt)", url) + resp = await client.get(url, timeout=30) - # Get IXP prefixes directly - no need for IXLAN lookup - log.debug("Downloading IXP prefixes...") - download_start = datetime.now() - response = await client.get(PEERINGDB_IXPFX_URL) - response.raise_for_status() - ixpfxs = response.json()["data"] - prefix_time = (datetime.now() - download_start).total_seconds() - - # Process IXP prefixes - use a generic IXP name since we don't need specific names - process_start = datetime.now() - ixp_count = 0 - total_prefixes = len(ixpfxs) - failed_prefixes = 0 - - for ixpfx in ixpfxs: - try: - prefix = ixpfx.get("prefix") - - if prefix: - network = ip_network(prefix, strict=False) - # Use "IXP Network" as generic name since we only need to know it's an IXP - ixp_name = "IXP Network" - self.ixp_networks.append( - (network.network_address, network.prefixlen, ixp_name) - ) - ixp_count += 1 - else: - failed_prefixes += 1 - except Exception: - failed_prefixes += 1 - - process_time = (datetime.now() - process_start).total_seconds() - - # Sort by prefix length (descending) for longest-match lookup - sort_start = datetime.now() - self.ixp_networks.sort(key=lambda x: x[1], reverse=True) - sort_time = (datetime.now() - sort_start).total_seconds() - - log.info( - f"✅ Downloaded {ixp_count}/{total_prefixes} IXP networks " - f"(download: {prefix_time:.1f}s, process: {process_time:.1f}s, " - f"sort: {sort_time:.1f}s, failed: {failed_prefixes})" - ) - return # Success - exit retry loop - - except Exception as e: - if "429" in str(e) or "Too Many Requests" in str(e): - if attempt < max_retries - 1: - delay = base_delay * (2 ** (attempt + 1)) - log.warning( - f"Rate limited by PeeringDB API (attempt 
{attempt + 1}/{max_retries}). Retrying in {delay}s..." - ) - continue - else: - log.error( - f"Rate limited by PeeringDB API after {max_retries} attempts. Skipping IXP data." - ) - break - else: + # Do not retry on 429 - treat as a failed download and return None + if resp.status_code != 200: log.warning( - f"Failed to download IXP data (attempt {attempt + 1}/{max_retries}): {e}" + "PeeringDB download failed for {}: HTTP {} - not retrying", + url, + resp.status_code, ) - if attempt < max_retries - 1: - continue - break + return None - # If we get here, all retries failed - log.warning("Could not download IXP data after retries - continuing without IXP detection") - log.info("ASN lookups will still work, but IXP networks won't be identified") - self.ixp_networks = [] + try: + return resp.json() + except Exception: + log.warning("Failed to parse JSON from {}", url) + return None + except Exception as e: + log.warning("PeeringDB download error for {}: {} - not retrying", url, e) + return None + + # Download each endpoint to .temp -> .json atomically + for name, url in endpoints.items(): + temp_path = IP_ENRICHMENT_DATA_DIR / f"{name}.temp" + final_path = IP_ENRICHMENT_DATA_DIR / f"{name}.json" + try: + data = await _fetch_with_backoff(url) + if not data: + log.warning( + "Failed to download {} (no data); will use existing {} if present", + url, + final_path, + ) + continue + + # Write to temp file first + try: + with open(temp_path, "w") as f: + json.dump(data, f, separators=(",", ":")) + # Atomic replace + import os + + os.replace(temp_path, final_path) + log.info("Saved PeeringDB dataset {} -> {}", name, final_path) + except Exception as e: + log.warning("Failed to write {}: {}", temp_path, e) + try: + if temp_path.exists(): + temp_path.unlink() + except Exception: + pass + except Exception as e: + log.warning( + "Failed to download {}: {}; will use existing {} if present", url, e, final_path + ) + + # After downloads, combine on-disk JSON files into the optimized 
pickle + # The actual combine logic is implemented in _combine_peeringdb_files so + # it can be reused (e.g., when the optimized pickle is missing but the + # raw JSON endpoint files are present and still fresh). + try: + loop = asyncio.get_running_loop() + await loop.run_in_executor(None, self._combine_peeringdb_files) + except Exception as e: + log.warning("Failed to combine PeeringDB datasets after download: {}", e) + + async def _query_bgp_tools_for_ip( + self, ip_str: str + ) -> t.Tuple[t.Optional[int], t.Optional[str], t.Optional[str]]: + """Query bgp.tools for a single IP. Prefer websocket API; fallback to httpx. + + Returns (asn_int, asn_name, prefix) or (None, None, None) on failure. + """ + # Check cache first + if ip_str in self._ip_cache: + return self._ip_cache[ip_str] + + # Use TCP WHOIS bulk mode on bgp.tools:43. We'll perform a blocking + # socket WHOIS request in a thread executor to keep this function async. + + def _whois_blocking( + single_ips: t.List[str], + ) -> t.Dict[str, t.Tuple[t.Optional[int], t.Optional[str], t.Optional[str]]]: + out: t.Dict[str, t.Tuple[t.Optional[int], t.Optional[str], t.Optional[str]]] = {} + host = "bgp.tools" + port = 43 + # If a query is numeric-only we should send it as an ASN query (AS12345) + send_keys = [f"AS{q}" if q.isdigit() else q for q in single_ips] + payload = "begin\n" + "\n".join(send_keys) + "\nend\n" + try: + with socket.create_connection((host, port), timeout=10) as s: + s.settimeout(10) + s.sendall(payload.encode("utf-8")) + parts = [] + try: + while True: + chunk = s.recv(4096) + if not chunk: + break + parts.append(chunk) + except socket.timeout: + pass + + raw = b"".join(parts).decode("utf-8", errors="replace") + # Parse lines like: "13335 | 1.1.1.1 | 1.1.1.0/24 | US | ARIN | ... | Cloudflare, Inc." 
+ for line in raw.splitlines(): + line = line.strip() + if not line: + continue + # split by pipe if present, else whitespace + if "|" in line: + cols = [c.strip() for c in line.split("|")] + try: + asn = int(cols[0]) if cols[0].isdigit() else None + except Exception: + asn = None + ipcol = cols[1] if len(cols) > 1 else None + prefix = cols[2] if len(cols) > 2 else None + org = cols[-1] if len(cols) > 0 else None + if ipcol: + out[ipcol] = (asn, org, prefix) + else: + # ASN-only response (no IP column). Index by ASN too. + if asn is not None: + out_key1 = f"AS{asn}" + out_key2 = str(asn) + out[out_key1] = (asn, org, prefix) + out[out_key2] = (asn, org, prefix) + else: + # Fallback parsing: "AS12345 ip prefix org" + parts_line = line.split() + if len(parts_line) >= 3: + try: + asn = int(parts_line[0]) + except Exception: + asn = None + ipcol = parts_line[1] + prefix = parts_line[2] + org = " ".join(parts_line[3:]) if len(parts_line) > 3 else None + if ipcol: + out[ipcol] = (asn, org, prefix) + else: + if asn is not None: + out_key1 = f"AS{asn}" + out_key2 = str(asn) + out[out_key1] = (asn, org, prefix) + out[out_key2] = (asn, org, prefix) + # Map results back to the original query keys. For numeric + # inputs we sent 'AS{n}', but callers may provide 'n'. Ensure + # we return entries keyed by the original queries. 
+ mapped: t.Dict[ + str, t.Tuple[t.Optional[int], t.Optional[str], t.Optional[str]] + ] = {} + for orig, sent in zip(single_ips, send_keys): + if sent in out: + mapped[orig] = out[sent] + elif orig in out: + mapped[orig] = out[orig] + else: + # Try ASN variants + if orig.isdigit(): + if f"AS{orig}" in out: + mapped[orig] = out[f"AS{orig}"] + elif orig in out: + mapped[orig] = out[orig] + else: + mapped[orig] = (None, None, None) + else: + mapped[orig] = (None, None, None) + return mapped + except Exception: + # On any socket/connect error return empties for all requested IPs + for ip in single_ips: + out[ip] = (None, None, None) + return out + + loop = asyncio.get_running_loop() + resp_map = await loop.run_in_executor(None, _whois_blocking, [ip_str]) + asn, org, prefix = resp_map.get(ip_str, (None, None, None)) + # Cache result + self._ip_cache[ip_str] = (asn, org, prefix) + return (asn, org, prefix) + + def _combine_peeringdb_files(self) -> bool: + """Combine existing PeeringDB JSON files into the optimized pickle. + + Reads ixpfx.json, ixlan.json, ix.json from the data directory (if + present), builds a deduplicated prefix->IXP name mapping, sorts by + prefix length (desc) and persists the result to ixp_data.pickle + atomically. Returns True on success, False otherwise. 
+ """ + try: + ixpfx_data = [] + ixlan_data = [] + ix_data = [] + + if (IP_ENRICHMENT_DATA_DIR / "ixpfx.json").exists(): + with open(IP_ENRICHMENT_DATA_DIR / "ixpfx.json", "r") as f: + raw = json.load(f) + if isinstance(raw, dict) and "data" in raw: + ixpfx_data = raw.get("data", []) + elif isinstance(raw, list): + ixpfx_data = raw + + if (IP_ENRICHMENT_DATA_DIR / "ixlan.json").exists(): + with open(IP_ENRICHMENT_DATA_DIR / "ixlan.json", "r") as f: + raw = json.load(f) + if isinstance(raw, dict) and "data" in raw: + ixlan_data = raw.get("data", []) + elif isinstance(raw, list): + ixlan_data = raw + + if (IP_ENRICHMENT_DATA_DIR / "ix.json").exists(): + with open(IP_ENRICHMENT_DATA_DIR / "ix.json", "r") as f: + raw = json.load(f) + if isinstance(raw, dict) and "data" in raw: + ix_data = raw.get("data", []) + elif isinstance(raw, list): + ix_data = raw + + # Build mappings: ixlan_id -> ix_id, ix_id -> ix_name + ixlan_to_ix = {} + for rec in ixlan_data: + try: + rid = rec.get("id") + ix_id = rec.get("ix_id") + if rid is not None and ix_id is not None: + ixlan_to_ix[rid] = ix_id + except Exception: + continue + + ix_id_to_name = {} + for rec in ix_data: + try: + ixid = rec.get("id") + name = rec.get("name_long") or rec.get("name") + if ixid is not None and name: + ix_id_to_name[ixid] = name + except Exception: + continue + + # Combine prefixes to IXP name + prefix_map: dict[str, str] = {} + for rec in ixpfx_data: + try: + prefix = rec.get("prefix") or rec.get("network") + ixlan_id = rec.get("ixlan_id") + if not prefix: + continue + ix_id = ixlan_to_ix.get(ixlan_id) + ix_name = None + if ix_id is not None: + ix_name = ix_id_to_name.get(ix_id) + # Fallback: some ixpfx entries include ix_name or ixlan name + if not ix_name: + ix_name = rec.get("name") or rec.get("ixp_name") + if not ix_name: + ix_name = "IXP" + # Normalize network + try: + net = ip_network(prefix, strict=False) + prefix_map[str(net)] = ix_name + except Exception: + # store raw prefix if parsing fails + 
prefix_map[prefix] = ix_name
+                except Exception:
+                    continue
+
+            # Build sorted list of tuples: (network_address_str, prefixlen, ix_name)
+            parsed = []
+            for pfx, name in prefix_map.items():
+                try:
+                    net = ip_network(pfx, strict=False)
+                    parsed.append((str(net.network_address), net.prefixlen, name))
+                except Exception:
+                    # try to skip invalid entries
+                    continue
+
+            # Sort by prefixlen desc
+            parsed.sort(key=lambda x: x[1], reverse=True)
+
+            # Persist parsed mapping as pickle for performance
+            tmp_pickle = IP_ENRICHMENT_DATA_DIR / "ixp_data.pickle.tmp"
+            final_pickle = IP_ENRICHMENT_DATA_DIR / "ixp_data.pickle"
+            try:
+                with open(tmp_pickle, "wb") as f:
+                    pickle.dump(parsed, f, protocol=pickle.HIGHEST_PROTOCOL)
+                import os
+
+                os.replace(tmp_pickle, final_pickle)
+                log.info(
+                    "Saved combined IXP prefix mapping ({} prefixes) -> {}",
+                    len(parsed),
+                    final_pickle,
+                )
+                # Also update in-memory list for immediate use
+                self.ixp_networks = [
+                    (ip_address(net), prefixlen, name) for net, prefixlen, name in parsed
+                ]
+                return True
+            except Exception as e:
+                log.warning("Failed to persist optimized pickle: {}", e)
+                return False
+
+        except Exception as e:
+            log.warning("Failed to combine PeeringDB datasets: {}", e)
+            return False
+
+    async def _query_bgp_tools_bulk(
+        self, ips: t.List[str]
+    ) -> t.Dict[str, t.Tuple[t.Optional[int], t.Optional[str], t.Optional[str]]]:
+        """Query bgp.tools for multiple IPs using a single TCP WHOIS (port 43) connection.
+
+        Returns a mapping ip -> (asn, asn_name, prefix).
+        """
+        results: t.Dict[str, t.Tuple[t.Optional[int], t.Optional[str], t.Optional[str]]] = {}
+
+        # Implement TCP WHOIS bulk mode against bgp.tools:43. Perform the
+        # blocking socket work in a thread executor so async callers are not
+        # blocked.
+ + def _whois_bulk_blocking( + bulk_ips: t.List[str], + ) -> t.Dict[str, t.Tuple[t.Optional[int], t.Optional[str], t.Optional[str]]]: + host = "bgp.tools" + port = 43 + out: t.Dict[str, t.Tuple[t.Optional[int], t.Optional[str], t.Optional[str]]] = {} + # Normalize numeric-only queries to ASN form for the WHOIS service + send_keys = [f"AS{q}" if q.isdigit() else q for q in bulk_ips] + payload = "begin\n" + "\n".join(send_keys) + "\nend\n" + try: + with socket.create_connection((host, port), timeout=15) as s: + s.settimeout(15) + s.sendall(payload.encode("utf-8")) + parts = [] + try: + while True: + chunk = s.recv(8192) + if not chunk: + break + parts.append(chunk) + if sum(len(p) for p in parts) > 512 * 1024: + # safety cap 512KB + break + except socket.timeout: + pass + + raw = b"".join(parts).decode("utf-8", errors="replace") + for line in raw.splitlines(): + line = line.strip() + if not line: + continue + if "|" in line: + cols = [c.strip() for c in line.split("|")] + try: + asn = int(cols[0]) if cols[0].isdigit() else None + except Exception: + asn = None + ipcol = cols[1] if len(cols) > 1 else None + prefix = cols[2] if len(cols) > 2 else None + org = cols[-1] if len(cols) > 0 else None + if ipcol: + out[ipcol] = (asn, org, prefix) + else: + # ASN-only response (no IP column). Index by ASN too. 
+ if asn is not None: + out_key1 = f"AS{asn}" + out_key2 = str(asn) + out[out_key1] = (asn, org, prefix) + out[out_key2] = (asn, org, prefix) + else: + parts_line = line.split() + if len(parts_line) >= 3: + try: + asn = int(parts_line[0]) + except Exception: + asn = None + ipcol = parts_line[1] + prefix = parts_line[2] + org = " ".join(parts_line[3:]) if len(parts_line) > 3 else None + out[ipcol] = (asn, org, prefix) + + # Map results back to original query keys + mapped: t.Dict[ + str, t.Tuple[t.Optional[int], t.Optional[str], t.Optional[str]] + ] = {} + for orig, sent in zip(bulk_ips, send_keys): + if sent in out: + mapped[orig] = out[sent] + elif orig in out: + mapped[orig] = out[orig] + else: + # Try ASN variants for numeric orig + if orig.isdigit(): + if f"AS{orig}" in out: + mapped[orig] = out[f"AS{orig}"] + elif orig in out: + mapped[orig] = out[orig] + else: + mapped[orig] = (None, None, None) + else: + mapped[orig] = (None, None, None) + return mapped + except Exception: + for ip in bulk_ips: + out[ip] = (None, None, None) + return out + + loop = asyncio.get_running_loop() + resp = await loop.run_in_executor(None, _whois_bulk_blocking, ips) + return resp + + async def lookup_ips_bulk(self, ips: t.List[str]) -> t.Dict[str, IPInfo]: + """Bulk lookup for multiple IPs, using local data first and bgp.tools bulk queries for misses.""" + results: t.Dict[str, IPInfo] = {} + + # Try a fast, non-blocking load of the optimized pickle; do NOT + # attempt a network refresh or acquire the download lock here since + # this function is called during request handling. If the pickle + # cannot be loaded, proceed with bgp.tools lookups only. 
+ if not self.ixp_networks: + self._try_load_pickle() + + # Prepare misses + misses: t.List[str] = [] + for ip in ips: + try: + target_ip = ip_address(ip) + except Exception: + results[ip] = IPInfo(ip) + continue + + # private/reserved + if target_ip.is_private or target_ip.is_reserved or target_ip.is_loopback: + results[ip] = IPInfo(ip, asn=0, asn_name="Private", prefix="Private Network") + continue + + # check IXP + found_ixp = False + for net_addr, prefixlen, ixp_name in self.ixp_networks: + try: + network = ip_network(f"{net_addr}/{prefixlen}", strict=False) + if target_ip in network: + results[ip] = IPInfo(ip, is_ixp=True, ixp_name=ixp_name) + found_ixp = True + break + except Exception: + continue + if found_ixp: + continue + + # try local optimized tables + if not self._lookup_optimized: + self._optimize_lookups() + + matched = False + target_int = int(target_ip) + if isinstance(target_ip, IPv4Address): + for net_int, mask_bits, asn, cidr_string in self._ipv4_networks: + if (target_int >> mask_bits) == (net_int >> mask_bits): + asn_data = self.asn_info.get(asn, {}) + asn_name = asn_data.get("name", f"AS{asn}") + country = asn_data.get("country", "") + results[ip] = IPInfo( + ip, asn=asn, asn_name=asn_name, prefix=cidr_string, country=country + ) + matched = True + break + else: + for net_int, mask_bits, asn, cidr_string in self._ipv6_networks: + if (target_int >> mask_bits) == (net_int >> mask_bits): + asn_data = self.asn_info.get(asn, {}) + asn_name = asn_data.get("name", f"AS{asn}") + country = asn_data.get("country", "") + results[ip] = IPInfo( + ip, asn=asn, asn_name=asn_name, prefix=cidr_string, country=country + ) + matched = True + break + + if not matched: + misses.append(ip) + + # Query bgp.tools in bulk for misses + if misses: + bulk = await self._query_bgp_tools_bulk(misses) + for ip in misses: + asn, asn_name, prefix = bulk.get(ip, (None, None, None)) + if asn: + try: + self.asn_info[int(asn)] = {"name": asn_name or f"AS{asn}", "country": ""} + 
except Exception:
+                        pass
+                    results[ip] = IPInfo(ip, asn=asn, asn_name=asn_name, prefix=prefix)
+                else:
+                    results[ip] = IPInfo(ip, asn=0, asn_name="Unknown")
+
+        return results
 
     async def lookup_ip(self, ip_str: str) -> IPInfo:
         """Lookup an IP address and return ASN or IXP information."""
-        if not await self.ensure_data_loaded():
-            log.warning("IP enrichment data not available")
-            return IPInfo(ip_str)
+        # Try to load IXP data, but continue even if the load fails. We still
+        # want to perform on-demand bgp.tools lookups for IPs when local data
+        # is missing; failing to load the IXP file should not prevent remote
+        # lookups.
+        try:
+            if not self.ixp_networks:
+                # Attempt a non-blocking pickle load only; don't trigger
+                # downloads or acquire locks while handling requests.
+                self._try_load_pickle()
+        except Exception:
+            log.debug("Non-blocking data load failed; continuing with on-demand lookups")
 
         # Ensure lookup optimization is done
         self._optimize_lookups()
@@ -680,6 +1198,18 @@ class IPEnrichmentService:
                     return IPInfo(
                         ip_str, asn=asn, asn_name=asn_name, prefix=cidr_string, country=country
                     )
+            # Not found in local tables - do an on-demand query to bgp.tools
+            try:
+                asn, asn_name, prefix = await self._query_bgp_tools_for_ip(ip_str)
+                if asn:
+                    # Update asn_info cache (best-effort)
+                    try:
+                        self.asn_info[int(asn)] = {"name": asn_name or f"AS{asn}", "country": ""}
+                    except Exception:
+                        pass
+                    return IPInfo(ip_str, asn=asn, asn_name=asn_name, prefix=prefix)
+            except Exception:
+                pass
         else:
             # Use optimized IPv6 lookup
             for net_int, mask_bits, asn, cidr_string in self._ipv6_networks:
@@ -700,16 
+1243,47 @@ class IPEnrichmentService: async def lookup_asn_name(self, asn: int) -> str: """Get the organization name for an ASN.""" - if not await self.ensure_data_loaded(): - return f"AS{asn}" + # Attempt to load data but don't fail if we can't; fall back to + # returning the numeric ASN string if we have no cached name. + try: + await self.ensure_data_loaded() + except Exception: + log.debug( + "ensure_data_loaded raised an exception while getting ASN name; using cached data if present" + ) asn_data = self.asn_info.get(asn, {}) - return asn_data.get("name", f"AS{asn}") + name = asn_data.get("name") + if name: + return name + + # Fallback: query bgp.tools via WHOIS bulk for ASN (e.g., 'AS12345') + try: + query = f"AS{asn}" + resp = await self._query_bgp_tools_bulk([query]) + # resp maps 'AS12345' -> (asn_int, org, prefix) or maps '12345' -> ... + entry = resp.get(query) or resp.get(str(asn)) + if entry: + a, org, _ = entry + if org: + try: + self.asn_info[int(asn)] = {"name": org, "country": ""} + except Exception: + pass + return org + except Exception: + pass + + return f"AS{asn}" async def lookup_asn_country(self, asn: int) -> str: """Get the country code for an ASN.""" - if not await self.ensure_data_loaded(): - return "" + try: + await self.ensure_data_loaded() + except Exception: + log.debug( + "ensure_data_loaded raised an exception while getting ASN country; using cached data if present" + ) asn_data = self.asn_info.get(asn, {}) return asn_data.get("country", "") @@ -788,6 +1362,7 @@ async def lookup_ip(ip_address: str) -> IPInfo: async def lookup_asn_name(asn: int) -> str: """Get the organization name for an ASN number.""" + # ASN lookups do not require loading PeeringDB data; perform direct lookup return await _service.lookup_asn_name(asn) @@ -806,24 +1381,68 @@ async def lookup_asns_bulk(asns: t.List[t.Union[str, int]]) -> t.Dict[str, t.Dic Dict mapping ASN string to {"name": org_name, "country": country_code} Example: {"12345": {"name": "Example ISP", 
"country": "US"}} """ - await _service.ensure_data_loaded() - + # Do NOT load PeeringDB data for ASN-only lookups; these use bgp.tools WHOIS + # and the in-memory `_service.asn_info` cache only. This avoids triggering + # PeeringDB downloads when callers only need ASN org names. results = {} + + # Normalize ASN list to strings and filter invalids + requested: list[str] = [] for asn in asns: - # Skip non-numeric ASNs like "IXP" - if asn == "IXP" or asn is None: + if asn is None: + continue + if str(asn) == "IXP": + continue + try: + _ = int(asn) + requested.append(str(asn)) + except (ValueError, TypeError): continue + # Ensure we have the data loaded + # Identify ASNs missing a human-friendly name so we can attempt a live WHOIS + missing: list[str] = [] + for asn in requested: + try: + ai = _service.asn_info.get(int(asn), {}) + name = ai.get("name") if isinstance(ai, dict) else None + if not name or name == f"AS{asn}": + missing.append(asn) + except Exception: + missing.append(asn) + + # If we have missing ASNs, try a live bgp.tools WHOIS bulk query to fetch org names + if missing and hasattr(_service, "_query_bgp_tools_bulk"): + try: + log.debug("lookup_asns_bulk: querying bgp.tools for missing ASNs: {}", missing) + queries = [f"AS{a}" for a in missing] + resp = await _service._query_bgp_tools_bulk(queries) + # resp maps query -> (asn_int, org, prefix) + for asn in missing: + q = f"AS{asn}" + entry = resp.get(q) or resp.get(str(asn)) + if entry: + _, org, _ = entry + if org: + try: + _service.asn_info[int(asn)] = {"name": org, "country": ""} + log.debug("lookup_asns_bulk: updated asn_info[{}] = {}", asn, org) + except Exception: + pass + except Exception as e: + log.debug("lookup_asns_bulk: bgp.tools lookup failed: {}", e) + + # Build final results from asn_info (may include newly-populated entries) + for asn in requested: try: asn_int = int(asn) asn_data = _service.asn_info.get(asn_int, {}) - results[str(asn)] = { + results[asn] = { "name": asn_data.get("name", 
f"AS{asn}"), "country": asn_data.get("country", ""), } - except (ValueError, TypeError): - # Skip invalid ASN values - continue + except Exception: + results[asn] = {"name": f"AS{asn}", "country": ""} return results @@ -831,6 +1450,26 @@ async def lookup_asns_bulk(asns: t.List[t.Union[str, int]]) -> t.Dict[str, t.Dic async def refresh_ip_enrichment_data(force: bool = False) -> bool: """Manually refresh IP enrichment data.""" log.info(f"Manual refresh requested (force={force})") + # Respect configuration: if IP enrichment is disabled, do not attempt + # to refresh or download PeeringDB data. This prevents manual or UI- + # triggered refreshes from hitting the network when the feature is + # administratively turned off. + try: + params = use_state("params") + if ( + not getattr(params, "structured", None) + or not params.structured.ip_enrichment.enrich_traceroute + or getattr(params.structured, "enable_for_traceroute", None) is False + ): + log.debug( + "IP enrichment for traceroute is disabled in configuration; skipping manual refresh" + ) + return False + except Exception: + # If we can't read config for some reason, proceed with refresh to + # avoid silently ignoring an admin's request. 
+ pass + return await _service.ensure_data_loaded(force_refresh=force) @@ -839,20 +1478,16 @@ def get_data_status() -> dict: status = { "data_directory": str(IP_ENRICHMENT_DATA_DIR), "files_exist": { - "cidr_data": CIDR_DATA_FILE.exists(), - "asn_data": ASN_DATA_FILE.exists(), - "ixp_data": IXP_DATA_FILE.exists(), + "ixp_data_pickle": IXP_PICKLE_FILE.exists(), "last_update": LAST_UPDATE_FILE.exists(), - "combined_cache": COMBINED_CACHE_FILE.exists(), - "raw_table": RAW_TABLE_FILE.exists(), - "raw_asns": RAW_ASNS_FILE.exists(), }, "last_update": None, "age_hours": None, "data_counts": { - "cidr_entries": len(_service.cidr_networks), - "asn_entries": len(_service.asn_info), - "ixp_networks": len(_service.ixp_networks), + # Prefer the in-memory count when available; otherwise try to + # inspect the optimized pickle on disk so status is accurate + # across multiple worker processes. + "ixp_networks": len(_service.ixp_networks) if _service.ixp_networks else None, }, } @@ -865,6 +1500,22 @@ def get_data_status() -> dict: except Exception: pass + # If in-memory count is empty (likely this worker hasn't loaded the + # pickle), attempt to read the optimized pickle on disk to compute a + # reliable count for the status endpoint without mutating service state. + if status["data_counts"].get("ixp_networks") in (None, 0) and IXP_PICKLE_FILE.exists(): + try: + with open(IXP_PICKLE_FILE, "rb") as f: + parsed = pickle.load(f) + if isinstance(parsed, list): + status["data_counts"]["ixp_networks"] = len(parsed) + else: + status["data_counts"]["ixp_networks"] = 0 + except Exception: + # If reading the pickle fails, leave the previously reported + # value (None or 0). This avoids crashing the status endpoint. 
+ pass + return status @@ -960,44 +1611,45 @@ async def network_info(*targets: str) -> TargetData: return default_data try: - _log.info(f"Enriching {len(query_targets)} IP addresses") - - # Load data ONCE for all lookups - await _service.ensure_data_loaded() + _log.info(f"Enriching {len(query_targets)} IP addresses using bulk lookup") + # Use the bulk lookup to query bgp.tools efficiently query_data = {} + bulk_results = await _service.lookup_ips_bulk(query_targets) - # Process each target without reloading data - for target in query_targets: - ip_info = _service.lookup_ip_direct( - target - ) # Use direct lookup that doesn't reload data - + for target, ip_info in bulk_results.items(): # Convert to TargetDetail format if ip_info.is_ixp and ip_info.ixp_name: - # IXP case - put "IXP" in ASN field and IXP name in org field detail: TargetDetail = { - "asn": "IXP", # Show "IXP" as the ASN for IXPs + "asn": "IXP", "ip": target, "prefix": "None", "country": "None", - "rir": "IXP", # Mark as IXP in RIR field + "rir": "IXP", "allocated": "None", "org": ip_info.ixp_name, } - elif ip_info.asn is not None: - # ASN case - normal network - return just the NUMBER, no AS prefix + elif ip_info.asn is not None and ip_info.asn != 0: detail = { - "asn": str(ip_info.asn), # Just the number as string, e.g. 
"12345" + "asn": str(ip_info.asn), "ip": target, - "prefix": ip_info.prefix or "None", # Use the CIDR from table.jsonl - "country": ip_info.country or "None", # Use country code from asns.csv - "rir": "UNKNOWN", # Not available from our enrichment - "allocated": "None", # Not available from our enrichment + "prefix": ip_info.prefix or "None", + "country": ip_info.country or "None", + "rir": "UNKNOWN", + "allocated": "None", "org": ip_info.asn_name or "None", } + elif ip_info.asn == 0: + detail = { + "asn": "None", + "ip": target, + "prefix": "None", + "country": "None", + "rir": "Unknown", + "allocated": "None", + "org": "None", + } else: - # No match found detail = { "asn": "None", "ip": target, diff --git a/hyperglass/main.py b/hyperglass/main.py index 5a5f0f0..02c85ad 100644 --- a/hyperglass/main.py +++ b/hyperglass/main.py @@ -32,6 +32,7 @@ if node_major < MIN_NODE_VERSION: from .util import cpu_count from .state import use_state from .settings import Settings +import os LOG_LEVEL = logging.INFO if Settings.debug is False else logging.DEBUG logging.basicConfig(handlers=[LibInterceptHandler()], level=0, force=True) @@ -155,10 +156,20 @@ def run(workers: int = None): _workers = workers if workers is None: - if Settings.debug: - _workers = 1 + # Allow environment override (useful for Docker Compose): + # HYPERGLASS_WORKERS=n + env_workers = os.getenv("HYPERGLASS_WORKERS") + if env_workers: + try: + _workers = max(1, int(env_workers)) + except Exception: + # Fall back to defaults on parse error + _workers = 1 if Settings.debug else cpu_count(2) else: - _workers = cpu_count(2) + if Settings.debug: + _workers = 1 + else: + _workers = cpu_count(2) log.bind( version=__version__, diff --git a/hyperglass/models/api/query.py b/hyperglass/models/api/query.py index 3b941fa..f46d923 100644 --- a/hyperglass/models/api/query.py +++ b/hyperglass/models/api/query.py @@ -117,7 +117,49 @@ class Query(BaseModel): @property def device(self) -> Device: """Get this query's device 
object by query_location.""" - return self._state.devices[self.query_location] + # Return a proxy around the device so we can override + # structured_output per-request without mutating global state. + device = self._state.devices[self.query_location] + + # Determine effective structured_output based on global params + try: + params = use_state("params") + except Exception: + params = None + + # Decide which top-level structured enable flag to consult + feature_flag_name = None + if getattr(self, "query_type", None) == "traceroute": + feature_flag_name = "enable_for_traceroute" + elif getattr(self, "query_type", None) in ("bgp_route", "bgp_routestr"): + feature_flag_name = "enable_for_bgp_route" + + effective_structured = bool(getattr(device, "structured_output", False)) + + if params is None or not getattr(params, "structured", None): + # Global structured block absent => structured disabled + effective_structured = False + else: + # If structured is present, default is enabled; allow opt-out + if feature_flag_name is not None: + if getattr(params.structured, feature_flag_name, None) is False: + effective_structured = False + + class _DeviceProxy: + """Tiny proxy object that delegates to the real device but + overrides structured_output.""" + + def __init__(self, real, structured_value: bool) -> None: + self._real = real + self.structured_output = structured_value + + def __getattr__(self, name: str): + return getattr(self._real, name) + + def __repr__(self) -> str: # pragma: no cover - trivial + return repr(self._real) + + return _DeviceProxy(device, effective_structured) @field_validator("query_location") def validate_query_location(cls, value): diff --git a/hyperglass/models/config/params.py b/hyperglass/models/config/params.py index 1e35ba2..5d3506d 100644 --- a/hyperglass/models/config/params.py +++ b/hyperglass/models/config/params.py @@ -87,7 +87,7 @@ class Params(ParamsPublic, HyperglassModel): docs: Docs = Docs() logging: Logging = Logging() messages: 
Messages = Messages() - structured: Structured = Structured() + structured: t.Optional[Structured] = None web: Web = Web() def __init__(self, **kw: t.Any) -> None: diff --git a/hyperglass/models/config/structured.py b/hyperglass/models/config/structured.py index 13195dc..63c86b3 100644 --- a/hyperglass/models/config/structured.py +++ b/hyperglass/models/config/structured.py @@ -39,12 +39,14 @@ class StructuredRpki(HyperglassModel): class StructuredIpEnrichment(HyperglassModel): - """Control IP enrichment for structured data responses.""" + """Control IP enrichment for structured data responses. + + Two tri-state flags are provided to allow the presence of a `structured:` + config block to imply the features are enabled, while still allowing users + to explicitly disable them. + """ - enabled: bool = False cache_timeout: int = 86400 # 24 hours in seconds (minimum) - enrich_next_hop: bool = False - enrich_traceroute: bool = True @field_validator("cache_timeout") def validate_cache_timeout(cls, value: int) -> int: @@ -53,6 +55,14 @@ class StructuredIpEnrichment(HyperglassModel): return 86400 return value + enrich_traceroute: bool = True + """Enable ASN/org/IP enrichment for traceroute hops. + + This option remains under `structured.ip_enrichment` per-user request and + must be True (in addition to top-level structured presence and + `structured.enable_for_traceroute` not being False) for enrichment to run. + """ + class Structured(HyperglassModel): """Control structured data responses.""" @@ -60,3 +70,10 @@ class Structured(HyperglassModel): communities: StructuredCommunities = StructuredCommunities() rpki: StructuredRpki = StructuredRpki() ip_enrichment: StructuredIpEnrichment = StructuredIpEnrichment() + + # Top-level structured enable/disable flags. If `structured:` is present in + # the user's config and these are not set (None), the structured table + # output is considered enabled by default. 
Setting them to False disables + # the structured table output even when a `structured:` block exists. + enable_for_traceroute: t.Optional[bool] = None + enable_for_bgp_route: t.Optional[bool] = None diff --git a/hyperglass/models/data/traceroute.py b/hyperglass/models/data/traceroute.py index 2ccde86..e2d929b 100644 --- a/hyperglass/models/data/traceroute.py +++ b/hyperglass/models/data/traceroute.py @@ -5,7 +5,7 @@ import typing as t from ipaddress import ip_address, AddressValueError # Third Party -from pydantic import field_validator +from pydantic import field_validator, computed_field # Project from hyperglass.external.ip_enrichment import TargetDetail @@ -58,6 +58,7 @@ class TracerouteHop(HyperglassModel): """Get the IP address for display purposes (may be truncated).""" return self.display_ip or self.ip_address + @computed_field @property def avg_rtt(self) -> t.Optional[float]: """Calculate average RTT from available measurements.""" diff --git a/hyperglass/models/parsing/mikrotik.py b/hyperglass/models/parsing/mikrotik.py index 16e3b5c..82517c4 100644 --- a/hyperglass/models/parsing/mikrotik.py +++ b/hyperglass/models/parsing/mikrotik.py @@ -341,19 +341,47 @@ class MikrotikTracerouteTable(MikrotikBase): """ _log = log.bind(parser="MikrotikTracerouteTable") - # DEBUG: Log the raw input - _log.debug(f"=== RAW MIKROTIK TRACEROUTE INPUT ===") - _log.debug(f"Target: {target}, Source: {source}") - _log.debug(f"Raw text length: {len(text)} characters") - _log.debug(f"Raw text:\n{repr(text)}") - _log.debug(f"=== END RAW INPUT ===") + # Minimal input summary to avoid excessive logs while keeping context + _log.debug( + "Parsing MikroTik traceroute", + target=target, + source=source, + lines=len(text.splitlines()), + ) + + # Try to extract target from the traceroute command in the output + # Look for patterns like: "tool traceroute src-address=192.168.1.1 timeout=1 duration=30 count=3 8.8.8.8" + lines = text.split("\n") + extracted_target = target # Default to 
passed target + + for line in lines[:10]: # Check first 10 lines for command + line = line.strip() + if line.startswith("tool traceroute") or "traceroute" in line: + # Extract target from command line - it's typically the last argument + parts = line.split() + for part in reversed(parts): + # Skip parameters with = signs and common flags + if ( + "=" not in part + and not part.startswith("-") + and not part.startswith("[") + and part + not in ["tool", "traceroute", "src-address", "timeout", "duration", "count"] + ): + # This looks like a target (IP or hostname) + if len(part) > 3: # Reasonable minimum length + extracted_target = part + break + break + + # Use extracted target if found, otherwise keep the passed target + if extracted_target != target: + _log.info( + f"Updated target from '{target}' to '{extracted_target}' based on command output" + ) + target = extracted_target lines = text.strip().split("\n") - _log.debug(f"Split into {len(lines)} lines") - - # DEBUG: Log each line with line numbers - for i, line in enumerate(lines): - _log.debug(f"Line {i:2d}: {repr(line)}") # Find all table starts - handle both formats: # Format 1: "Columns: ADDRESS, LOSS, SENT..." 
(newer format with hop numbers) @@ -367,7 +395,6 @@ class MikrotikTracerouteTable(MikrotikBase): and not line.strip().startswith(("1", "2", "3", "4", "5", "6", "7", "8", "9")) ): table_starts.append(i) - _log.debug(f"Found table start at line {i}: {repr(line)}") if not table_starts: _log.warning("No traceroute table headers found in output") @@ -376,14 +403,15 @@ class MikrotikTracerouteTable(MikrotikBase): # Take the LAST table (newest/final results) last_table_start = table_starts[-1] _log.debug( - f"Found {len(table_starts)} tables, using the last one starting at line {last_table_start}" + "Found traceroute tables", + tables_found=len(table_starts), + last_table_start=last_table_start, ) # Determine format by checking the header line header_line = lines[last_table_start].strip() is_columnar_format = "Columns:" in header_line - _log.debug(f"Header line: {repr(header_line)}") - _log.debug(f"Is columnar format: {is_columnar_format}") + _log.debug("Header determined", header=header_line, columnar=is_columnar_format) # Parse only the last table hops = [] @@ -398,7 +426,6 @@ class MikrotikTracerouteTable(MikrotikBase): # Skip empty lines if not line: - _log.debug(f"Line {i}: EMPTY - skipping") continue # Skip the column header lines @@ -408,16 +435,14 @@ class MikrotikTracerouteTable(MikrotikBase): or line.startswith("#") ): in_data_section = True - _log.debug(f"Line {i}: HEADER - entering data section: {repr(line)}") continue # Skip paging prompts if "-- [Q quit|C-z pause]" in line: - _log.debug(f"Line {i}: PAGING PROMPT - breaking: {repr(line)}") break # End of this table if in_data_section and line: - _log.debug(f"Line {i}: PROCESSING DATA LINE: {repr(line)}") + # Process data line try: # Define helper function for RTT parsing def parse_rtt(rtt_str: str) -> t.Optional[float]: @@ -439,7 +464,6 @@ class MikrotikTracerouteTable(MikrotikBase): ): # This is a timeout/continuation hop parts = line.split() - _log.debug(f"Line {i}: Timeout/continuation line, parts: 
{parts}") if len(parts) >= 2 and parts[0].endswith("%"): ip_address = None @@ -471,15 +495,13 @@ class MikrotikTracerouteTable(MikrotikBase): ) hops.append(hop) current_hop_number += 1 - _log.debug(f"Line {i}: Created timeout hop {hop.hop_number}") continue if is_columnar_format: # New format: "1 10.0.0.41 0% 1 0.5ms 0.5 0.5 0.5 0" parts = line.split() - _log.debug(f"Line {i}: Columnar format, parts: {parts}") if len(parts) < 3: - _log.debug(f"Line {i}: Too few parts ({len(parts)}), skipping") + continue continue hop_number = int(parts[0]) @@ -504,15 +526,14 @@ class MikrotikTracerouteTable(MikrotikBase): best_rtt_str = "timeout" worst_rtt_str = "timeout" else: - _log.debug(f"Line {i}: Doesn't match columnar patterns, skipping") + continue continue else: # Old format: "196.60.8.198 0% 1 17.1ms 17.1 17.1 17.1 0" # We need to deduplicate by taking the LAST occurrence of each IP parts = line.split() - _log.debug(f"Line {i}: Old format, parts: {parts}") if len(parts) < 6: - _log.debug(f"Line {i}: Too few parts ({len(parts)}), skipping") + continue continue ip_address = parts[0] if not parts[0].endswith("%") else None @@ -520,7 +541,9 @@ class MikrotikTracerouteTable(MikrotikBase): # Check for truncated IPv6 addresses if ip_address and (ip_address.endswith("...") or ip_address.endswith("..")): _log.warning( - f"Line {i}: Truncated IP address detected: {ip_address} - setting to None" + "Truncated IP address detected, setting to None", + line=i, + ip=ip_address, ) ip_address = None @@ -548,7 +571,7 @@ class MikrotikTracerouteTable(MikrotikBase): # Convert timing values def parse_rtt(rtt_str: str) -> t.Optional[float]: - if rtt_str in ("timeout", "-", "0ms"): + if rtt_str in ("timeout", "-", "0ms", "*"): return None # Remove 'ms' suffix and convert to float rtt_clean = re.sub(r"ms$", "", rtt_str) @@ -579,19 +602,17 @@ class MikrotikTracerouteTable(MikrotikBase): ) hops.append(hop_obj) - _log.debug( - f"Line {i}: Created hop {final_hop_number}: {ip_address} - {loss_pct}% - 
{sent_count} sent" - ) except (ValueError, IndexError) as e: - _log.debug(f"Failed to parse line '{line}': {e}") + _log.debug("Failed to parse traceroute data line", line=line, error=str(e)) continue - _log.debug(f"Before deduplication: {len(hops)} hops") + # Snapshot before deduplication + orig_hop_count = len(hops) # For old format, we need to deduplicate by IP and take only final stats if not is_columnar_format and hops: - _log.debug(f"Old format detected - deduplicating {len(hops)} total entries") + _log.debug("Old format detected - deduplicating entries", total_entries=len(hops)) # Group by IP address and take the HIGHEST SENT count (final stats) ip_to_final_hop = {} @@ -610,16 +631,11 @@ class MikrotikTracerouteTable(MikrotikBase): if ip_key not in hop_order: hop_order.append(ip_key) ip_to_max_sent[ip_key] = 0 - _log.debug(f"New IP discovered: {ip_key}") # Keep hop with highest SENT count (most recent/final stats) if hop.sent_count and hop.sent_count >= ip_to_max_sent[ip_key]: ip_to_max_sent[ip_key] = hop.sent_count ip_to_final_hop[ip_key] = hop - _log.debug(f"Updated {ip_key}: SENT={hop.sent_count} (final stats)") - - _log.debug(f"IP order: {hop_order}") - _log.debug(f"Final IP stats: {[(ip, ip_to_max_sent[ip]) for ip in hop_order]}") # Rebuild hops list with final stats and correct hop numbers final_hops = [] @@ -627,26 +643,59 @@ class MikrotikTracerouteTable(MikrotikBase): final_hop = ip_to_final_hop[ip_key] final_hop.hop_number = i # Correct hop numbering final_hops.append(final_hop) - _log.debug( - f"Final hop {i}: {ip_key} - Loss: {final_hop.loss_pct}% - Sent: {final_hop.sent_count}" - ) hops = final_hops - _log.debug(f"Deduplication complete: {len(hops)} unique hops with final stats") - - _log.debug(f"After processing: {len(hops)} final hops") - for hop in hops: _log.debug( - f"Final hop {hop.hop_number}: {hop.ip_address} - {hop.loss_pct}% loss - {hop.sent_count} sent" + "Deduplication complete", + before=orig_hop_count, + after=len(hops), + ) + + # 
Filter excessive timeout hops ONLY at the end (no more valid hops after) + # Find the last hop with a valid IP address + last_valid_hop_index = -1 + for i, hop in enumerate(hops): + if hop.ip_address is not None and hop.loss_pct < 100: + last_valid_hop_index = i + + filtered_hops = [] + trailing_timeouts = 0 + + for i, hop in enumerate(hops): + if i > last_valid_hop_index and hop.ip_address is None and hop.loss_pct == 100: + # This is a trailing timeout hop (after the last valid hop) + trailing_timeouts += 1 + if trailing_timeouts <= 3: # Only keep first 3 trailing timeouts + filtered_hops.append(hop) + else: + # drop extra trailing timeouts + continue + else: + # This is either a valid hop or a timeout hop with valid hops after it + filtered_hops.append(hop) + + # Renumber the filtered hops + for i, hop in enumerate(filtered_hops, 1): + hop.hop_number = i + + hops = filtered_hops + if last_valid_hop_index >= 0: + _log.debug( + "Filtered trailing timeouts", + last_valid_index=last_valid_hop_index, + trailing_timeouts_removed=max(0, orig_hop_count - len(hops)), ) result = MikrotikTracerouteTable(target=target, source=source, hops=hops) - _log.info(f"Parsed {len(hops)} hops from MikroTik traceroute final table") + _log.info("Parsed traceroute final table", hops=len(hops)) return result def traceroute_result(self): """Convert to TracerouteResult format.""" from hyperglass.models.data.traceroute import TracerouteResult, TracerouteHop + from hyperglass.log import log + + _log = log.bind(parser="MikrotikTracerouteTable") converted_hops = [] for hop in self.hops: @@ -659,32 +708,34 @@ class MikrotikTracerouteTable(MikrotikBase): display_ip = hop.ip_address ip_address = None - converted_hops.append( - TracerouteHop( - hop_number=hop.hop_number, - ip_address=ip_address, # None for truncated IPs - display_ip=display_ip, # Truncated IP for display - hostname=hop.hostname, - rtt1=hop.best_rtt, - rtt2=hop.avg_rtt, - rtt3=hop.worst_rtt, - # MikroTik-specific statistics - 
loss_pct=hop.loss_pct, - sent_count=hop.sent_count, - last_rtt=hop.last_rtt, - best_rtt=hop.best_rtt, - worst_rtt=hop.worst_rtt, - # BGP enrichment fields will be populated by enrichment plugin - # For truncated IPs, these will remain None/empty - asn=None, - org=None, - prefix=None, - country=None, - rir=None, - allocated=None, - ) + created_hop = TracerouteHop( + hop_number=hop.hop_number, + ip_address=ip_address, # None for truncated IPs + display_ip=display_ip, # Truncated IP for display + hostname=hop.hostname, + # Set RTT values to ensure avg_rtt property returns MikroTik's AVG value + # Since avg_rtt = (rtt1 + rtt2 + rtt3) / 3, we set all to the MikroTik AVG + rtt1=hop.avg_rtt, # Set to AVG so computed average is correct + rtt2=hop.avg_rtt, # Set to AVG so computed average is correct + rtt3=hop.avg_rtt, # Set to AVG so computed average is correct + # MikroTik-specific statistics (preserve original values) + loss_pct=hop.loss_pct, + sent_count=hop.sent_count, + last_rtt=hop.last_rtt, # Preserve LAST value + best_rtt=hop.best_rtt, # Preserve BEST value + worst_rtt=hop.worst_rtt, # Preserve WORST value + # BGP enrichment fields will be populated by enrichment plugin + # For truncated IPs, these will remain None/empty + asn=None, + org=None, + prefix=None, + country=None, + rir=None, + allocated=None, ) + converted_hops.append(created_hop) + return TracerouteResult( target=self.target, source=self.source, diff --git a/hyperglass/plugins/_builtin/__init__.py b/hyperglass/plugins/_builtin/__init__.py index 516ca8d..1b60c73 100644 --- a/hyperglass/plugins/_builtin/__init__.py +++ b/hyperglass/plugins/_builtin/__init__.py @@ -14,6 +14,9 @@ from .traceroute_ip_enrichment import ZTracerouteIpEnrichment from .bgp_route_ip_enrichment import ZBgpRouteIpEnrichment from .trace_route_mikrotik import TraceroutePluginMikrotik from .trace_route_huawei import TraceroutePluginHuawei +from .trace_route_arista import TraceroutePluginArista +from .trace_route_frr import 
TraceroutePluginFrr +from .trace_route_juniper import TraceroutePluginJuniper __all__ = ( "BGPRoutePluginArista", @@ -28,5 +31,8 @@ __all__ = ( "ZBgpRouteIpEnrichment", "TraceroutePluginMikrotik", "TraceroutePluginHuawei", + "TraceroutePluginArista", + "TraceroutePluginFrr", + "TraceroutePluginJuniper", "RemoveCommand", ) diff --git a/hyperglass/plugins/_builtin/bgp_route_ip_enrichment.py b/hyperglass/plugins/_builtin/bgp_route_ip_enrichment.py index f0782a3..7ed8539 100644 --- a/hyperglass/plugins/_builtin/bgp_route_ip_enrichment.py +++ b/hyperglass/plugins/_builtin/bgp_route_ip_enrichment.py @@ -1,7 +1,6 @@ """IP enrichment for structured BGP route data - show path functionality.""" # Standard Library -import asyncio import typing as t # Third Party @@ -18,7 +17,6 @@ if t.TYPE_CHECKING: class ZBgpRouteIpEnrichment(OutputPlugin): - """Enrich structured BGP route output with IP enrichment for next-hop ASN/organization data.""" _hyperglass_builtin: bool = PrivateAttr(True) platforms: t.Sequence[str] = ( @@ -35,80 +33,11 @@ class ZBgpRouteIpEnrichment(OutputPlugin): directives: t.Sequence[str] = ("bgp_route", "bgp_community") common: bool = True - async def _enrich_async(self, output: BGPRouteTable, enrich_next_hop: bool = True) -> None: - """Async helper to enrich BGP route data.""" - _log = log.bind(plugin=self.__class__.__name__) - - if enrich_next_hop: - try: - # First enrich with next-hop IP information (if enabled) - await output.enrich_with_ip_enrichment() - _log.debug("BGP next-hop IP enrichment completed") - except Exception as e: - _log.error(f"BGP next-hop IP enrichment failed: {e}") - else: - _log.debug("BGP next-hop IP enrichment skipped (disabled in config)") - - try: - # Always enrich AS path ASNs with organization names - await output.enrich_as_path_organizations() - _log.debug("BGP AS path organization enrichment completed") - except Exception as e: - _log.error(f"BGP AS path organization enrichment failed: {e}") - def process(self, *, output: 
"OutputDataModel", query: "Query") -> "OutputDataModel": - """Enrich structured BGP route data with next-hop IP enrichment information.""" if not isinstance(output, BGPRouteTable): return output _log = log.bind(plugin=self.__class__.__name__) - _log.warning(f"🔍 BGP ROUTE PLUGIN STARTED - Processing {len(output.routes)} BGP routes") - # Check if IP enrichment is enabled in config - enrich_next_hop = True - try: - from hyperglass.state import use_state - - params = use_state("params") - if not params.structured.ip_enrichment.enabled: - _log.debug("IP enrichment disabled in configuration") - return output - - # Check next-hop enrichment setting but don't exit - we still want ASN org enrichment - enrich_next_hop = params.structured.ip_enrichment.enrich_next_hop - if not enrich_next_hop: - _log.debug( - "Next-hop enrichment disabled in configuration - will skip next-hop lookup but continue with ASN organization enrichment" - ) - except Exception as e: - _log.debug(f"Could not check IP enrichment config: {e}") - - # Use the built-in enrichment method from BGPRouteTable - try: - # Run async enrichment in sync context - loop = None - try: - loop = asyncio.get_event_loop() - if loop.is_running(): - # If we're already in an event loop, create a new task - import concurrent.futures - - with concurrent.futures.ThreadPoolExecutor() as executor: - future = executor.submit( - asyncio.run, self._enrich_async(output, enrich_next_hop) - ) - future.result() - else: - loop.run_until_complete(self._enrich_async(output, enrich_next_hop)) - except RuntimeError: - # No event loop, create one - asyncio.run(self._enrich_async(output, enrich_next_hop)) - _log.warning( - f"🔍 BGP ROUTE PLUGIN COMPLETED - ASN organizations: {len(output.asn_organizations)}" - ) - except Exception as e: - _log.error(f"BGP route IP enrichment failed: {e}") - - _log.debug(f"Completed enrichment for BGP routes") return output diff --git a/hyperglass/plugins/_builtin/mikrotik_garbage_output.py 
b/hyperglass/plugins/_builtin/mikrotik_garbage_output.py index 435cd84..4c67e24 100644 --- a/hyperglass/plugins/_builtin/mikrotik_garbage_output.py +++ b/hyperglass/plugins/_builtin/mikrotik_garbage_output.py @@ -41,78 +41,132 @@ class MikrotikGarbageOutput(OutputPlugin): return "" lines = raw_output.splitlines() - cleaned_lines = [] - found_header = False - data_lines = [] + # Remove command echoes and paging, keep only header markers and data lines + # We'll split the output into discrete tables (each table begins at a header) + tables: t.List[t.List[str]] = [] + current_table: t.List[str] = [] + header_line: t.Optional[str] = None for line in lines: stripped = line.strip() - # Skip empty lines - if not stripped: - continue - - # Skip interactive paging prompts - if "-- [Q quit|C-z pause]" in stripped or "-- [Q quit|D dump|C-z pause]" in stripped: + # Skip empty lines and interactive paging prompts + if not stripped or "-- [Q quit|C-z pause]" in stripped or "-- [Q quit|D dump|C-z pause]" in stripped: continue # Skip command echo lines if "tool traceroute" in stripped: continue - # Look for the header line (ADDRESS LOSS SENT LAST AVG BEST WORST) + # If this is a header line, start a new table if "ADDRESS" in stripped and "LOSS" in stripped and "SENT" in stripped: - if not found_header: - cleaned_lines.append(line) - found_header = True + header_line = line + # If we were collecting a table, push it + if current_table: + tables.append(current_table) + current_table = [] + # Start collecting after header continue - # After finding header, collect all data lines - if found_header and stripped: - data_lines.append(line) + # Collect data lines (will be associated with the most recent header) + if header_line is not None: + current_table.append(line) - # Process data lines to aggregate trailing timeouts - if data_lines: - processed_lines = [] - trailing_timeout_count = 0 + # Push the last collected table if any + if current_table: + tables.append(current_table) - # Work 
backwards to count trailing timeouts - for i in range(len(data_lines) - 1, -1, -1): - line = data_lines[i] - if ( - "100%" in line.strip() - and "timeout" in line.strip() - and not line.strip().startswith( - ("1", "2", "3", "4", "5", "6", "7", "8", "9", "0") - ) - ): - # This is a timeout line (no IP address at start) - trailing_timeout_count += 1 + # If we didn't find any header/data, return cleaned minimal output + if not tables: + # Fallback to previous behavior: remove prompts and flags + filtered_lines: t.List[str] = [] + in_flags_section = False + for line in lines: + stripped_line = line.strip() + if stripped_line.startswith("@") and stripped_line.endswith("] >"): + continue + if "[Q quit|D dump|C-z pause]" in stripped_line: + continue + if stripped_line.startswith("Flags:"): + in_flags_section = True + continue + if in_flags_section: + if "=" in stripped_line: + in_flags_section = False + else: + continue + filtered_lines.append(line) + return "\n".join(filtered_lines) + + # Aggregate tables by hop index. For each hop position, pick the row with the + # highest SENT count. If SENT ties, prefer non-timeout rows and the later table. + processed_lines: t.List[str] = [] + + # Regex to extract LOSS% and SENT count following it: e.g. 
'0% 3' + sent_re = re.compile(r"(\d+)%\s+(\d+)\b") + + max_rows = max(len(t) for t in tables) + + for i in range(max_rows): + best_row = None + best_sent = -1 + best_is_timeout = True + best_table_index = -1 + + for ti, table in enumerate(tables): + if i >= len(table): + continue + row = table[i] + m = sent_re.search(row) + if m: + try: + sent = int(m.group(2)) + except Exception: + sent = 0 else: - # Found a non-timeout line, stop counting - break + sent = 0 - # Add non-trailing lines as-is - non_trailing_count = len(data_lines) - trailing_timeout_count - processed_lines.extend(data_lines[:non_trailing_count]) + is_timeout = "timeout" in row.lower() or ("100%" in row and "timeout" in row.lower()) - # Handle trailing timeouts - if trailing_timeout_count > 0: - if trailing_timeout_count <= 3: - # If 3 or fewer trailing timeouts, show them all - processed_lines.extend(data_lines[non_trailing_count:]) - else: - # If more than 3 trailing timeouts, show first 2 and aggregate the rest - processed_lines.extend(data_lines[non_trailing_count : non_trailing_count + 2]) - remaining_timeouts = trailing_timeout_count - 2 - # Add an aggregation line - processed_lines.append( - f" ... 
({remaining_timeouts} more timeout hops)" - ) + # Prefer higher SENT, then prefer non-timeout, then later table (higher ti) + pick = False + if sent > best_sent: + pick = True + elif sent == best_sent: + if best_is_timeout and not is_timeout: + pick = True + elif (best_is_timeout == is_timeout) and ti > best_table_index: + pick = True - cleaned_lines.extend(processed_lines) + if pick: + best_row = row + best_sent = sent + best_is_timeout = is_timeout + best_table_index = ti - return "\n".join(cleaned_lines) + if best_row is not None: + processed_lines.append(best_row) + + # Collapse excessive trailing timeouts into an aggregation line + trailing_timeouts = 0 + for line in reversed(processed_lines): + if ("timeout" in line.lower()) or (sent_re.search(line) and sent_re.search(line).group(1) == "100"): + trailing_timeouts += 1 + else: + break + + if trailing_timeouts > 3: + non_trailing = len(processed_lines) - trailing_timeouts + # Keep first 2 of trailing timeouts and aggregate the rest + aggregated = processed_lines[:non_trailing] + processed_lines[non_trailing:non_trailing + 2] + remaining = trailing_timeouts - 2 + aggregated.append(f" ... 
({remaining} more timeout hops)") + processed_lines = aggregated + + # Prepend header line if we have one + header_to_use = header_line or "ADDRESS LOSS SENT LAST AVG BEST WORST STD-DEV STATUS" + cleaned = [header_to_use] + processed_lines + return "\n".join(cleaned) def process(self, *, output: OutputType, query: "Query") -> Series[str]: """ @@ -185,5 +239,12 @@ class MikrotikGarbageOutput(OutputPlugin): cleaned_output = "\n".join(filtered_lines) cleaned_outputs.append(cleaned_output) - log.debug(f"MikrotikGarbageOutput cleaned {len(output)} output blocks.") + # Minimal debug logging: log number of cleaned blocks and if any aggregation occurred + if len(output) > 0: + log.debug(f"MikrotikGarbageOutput processed {len(output)} output blocks.") + # If any aggregation line was added, log that event + for cleaned in cleaned_outputs: + if "... (" in cleaned: + log.debug("Aggregated excessive trailing timeout hops in traceroute output.") + break return tuple(cleaned_outputs) diff --git a/hyperglass/plugins/_builtin/trace_route_arista.py b/hyperglass/plugins/_builtin/trace_route_arista.py new file mode 100644 index 0000000..f668415 --- /dev/null +++ b/hyperglass/plugins/_builtin/trace_route_arista.py @@ -0,0 +1,657 @@ +"""Parse Arista traceroute output to structured data.""" + +# Standard Library +import re +import typing as t + +# Third Party +from pydantic import PrivateAttr + +# Project +from hyperglass.log import log +from hyperglass.exceptions.private import ParsingError +from hyperglass.models.data.traceroute import TracerouteResult, TracerouteHop +from hyperglass.state import use_state + +# Local +from .._output import OutputPlugin + +if t.TYPE_CHECKING: + from hyperglass.models.data import OutputDataModel + from hyperglass.models.api.query import Query + from .._output import OutputType + + +def _normalize_output(output: t.Union[str, t.Sequence[str]]) -> t.List[str]: + """Ensure the output is a list of strings.""" + if isinstance(output, str): + return [output] + 
return list(output) + + +def parse_arista_traceroute( + output: t.Union[str, t.Sequence[str]], target: str, source: str +) -> "OutputDataModel": + """Parse an Arista traceroute text response.""" + result = None + out_list = _normalize_output(output) + + _log = log.bind(plugin=TraceroutePluginArista.__name__) + combined_output = "\n".join(out_list) + + # DEBUG: Log the raw output we're about to parse + _log.debug(f"=== ARISTA TRACEROUTE PLUGIN RAW INPUT ===") + _log.debug(f"Target: {target}, Source: {source}") + _log.debug(f"Output pieces: {len(out_list)}") + _log.debug(f"Combined output length: {len(combined_output)}") + _log.debug(f"First 500 chars: {repr(combined_output[:500])}") + _log.debug(f"=== END PLUGIN RAW INPUT ===") + + try: + result = AristaTracerouteTable.parse_text(combined_output, target, source) + except Exception as exc: + _log.error(f"Failed to parse Arista traceroute: {exc}") + raise ParsingError(f"Failed to parse Arista traceroute output: {exc}") from exc + + _log.debug(f"=== FINAL STRUCTURED TRACEROUTE RESULT ===") + _log.debug(f"Successfully parsed {len(result.hops)} traceroute hops") + _log.debug(f"Target: {target}, Source: {source}") + for hop in result.hops: + _log.debug(f"Hop {hop.hop_number}: {hop.ip_address or '*'} - RTT: {hop.rtt1 or 'timeout'}") + _log.debug(f"Raw output length: {len(combined_output)} characters") + _log.debug(f"=== END STRUCTURED RESULT ===") + + return result + + +class AristaTracerouteTable(TracerouteResult): + """Arista traceroute table parser.""" + + @classmethod + def parse_text(cls, text: str, target: str, source: str) -> TracerouteResult: + """Parse Arista traceroute text output into structured data.""" + _log = log.bind(parser="AristaTracerouteTable") + + _log.debug(f"=== RAW ARISTA TRACEROUTE INPUT ===") + _log.debug(f"Target: {target}, Source: {source}") + _log.debug(f"Raw text length: {len(text)} characters") + _log.debug(f"Raw text:\n{repr(text)}") + _log.debug(f"=== END RAW INPUT ===") + + hops = [] + 
lines = text.strip().split("\n") + + _log.debug(f"Split into {len(lines)} lines") + + # Pattern for normal hop: " 1 er03-ter.jhb.as37739.net (102.209.241.6) 0.285 ms 0.177 ms 0.137 ms" + # Also handles IPv6: " 1 2001:43f8:6d0::10:3 (2001:43f8:6d0::10:3) 19.460 ms 19.416 ms 19.353 ms" + hop_pattern = re.compile( + r"^\s*(\d+)\s+(.+?)\s+\(([^)]+)\)(?:\s+<[^>]+>)?\s+(\d+(?:\.\d+)?)\s*ms(?:\s+(\d+(?:\.\d+)?)\s*ms)?(?:\s+(\d+(?:\.\d+)?)\s*ms)?" + ) + + # Pattern for MPLS hop with labels: " 2 41.78.188.48 (41.78.188.48) 1653.906 ms" + mpls_hop_pattern = re.compile( + r"^\s*(\d+)\s+(.+?)\s+\(([^)]+)\)\s+]+>\s+(\d+(?:\.\d+)?)\s*ms(?:\s+(\d+(?:\.\d+)?)\s*ms)?(?:\s+(.+?)\s+\(([^)]+)\)\s+]+>\s+(\d+(?:\.\d+)?)\s*ms)?" + ) + + # Pattern for complex multipath with mixed timeouts and IPs: + # "10 ae22.cr11-lon2.ip6.gtt.net (2001:668:0:3:ffff:1:0:3471) 201.963 ms be8443.ccr41.lon13.atlas.cogentco.com (2001:550:0:1000::9a36:3859) 184.724 ms *" + complex_multipath_pattern = re.compile( + r"^\s*(\d+)\s+(.+?)\s+\(([^)]+)\)(?:\s+<[^>]+>)?\s+(\d+(?:\.\d+)?)\s*ms\s+(.+?)\s+\(([^)]+)\)(?:\s+<[^>]+>)?\s+(\d+(?:\.\d+)?)\s*ms(?:\s+\*|\s+(.+?)\s+\(([^)]+)\)(?:\s+<[^>]+>)?\s+(\d+(?:\.\d+)?)\s*ms)?" 
+ ) + + # Pattern for partial timeout multipath: " 8 * * 2c0f:fa90:0:8::5 (2c0f:fa90:0:8::5) 179.449 ms" + partial_timeout_pattern = re.compile( + r"^\s*(\d+)\s+\*\s+\*\s+(.+?)\s+\(([^)]+)\)(?:\s+<[^>]+>)?\s+(\d+(?:\.\d+)?)\s*ms" + ) + + # Pattern for mixed timeout start: " 9 ae22.cr11-lon2.ip6.gtt.net (2001:668:0:3:ffff:1:0:3471) 201.979 ms * 2c0f:fa90:0:8::5 (2c0f:fa90:0:8::5) 179.438 ms" + mixed_timeout_start_pattern = re.compile( + r"^\s*(\d+)\s+(.+?)\s+\(([^)]+)\)(?:\s+<[^>]+>)?\s+(\d+(?:\.\d+)?)\s*ms\s+\*\s+(.+?)\s+\(([^)]+)\)(?:\s+<[^>]+>)?\s+(\d+(?:\.\d+)?)\s*ms" + ) + + # Pattern for triple multipath IPv6: "30 2001:41d0:0:50::b:66 (2001:41d0:0:50::b:66) 442.036 ms 2402:1f00:8201:586:: (2402:1f00:8201:586::) 456.999 ms 2001:41d0:0:50::b:66 (2001:41d0:0:50::b:66) 441.399 ms" + triple_multipath_pattern = re.compile( + r"^\s*(\d+)\s+(.+?)\s+\(([^)]+)\)(?:\s+<[^>]+>)?\s+(\d+(?:\.\d+)?)\s*ms\s+(.+?)\s+\(([^)]+)\)(?:\s+<[^>]+>)?\s+(\d+(?:\.\d+)?)\s*ms\s+(.+?)\s+\(([^)]+)\)(?:\s+<[^>]+>)?\s+(\d+(?:\.\d+)?)\s*ms" + ) + + # Pattern for multiple IPs in one hop (load balancing): + # " 2 po204.asw02.jnb1.tfbnw.net (2620:0:1cff:dead:beef::5316) 0.249 ms 0.234 ms po204.asw04.jnb1.tfbnw.net (2620:0:1cff:dead:beef::5524) 0.244 ms" + multi_hop_pattern = re.compile( + r"^\s*(\d+)\s+(.+?)\s+\(([^)]+)\)(?:\s+<[^>]+>)?\s+(\d+(?:\.\d+)?)\s*ms(?:\s+(\d+(?:\.\d+)?)\s*ms)?\s+(.+?)\s+\(([^)]+)\)(?:\s+<[^>]+>)?\s+(\d+(?:\.\d+)?)\s*ms" + ) + + # Pattern for timeout hop: " 6 * * *" + timeout_pattern = re.compile(r"^\s*(\d+)\s+\*\s*\*\s*\*") + + # Pattern for single IP without hostname: "12 72.251.0.8 (72.251.0.8) 421.861 ms 421.788 ms 419.821 ms" + ip_only_pattern = re.compile( + r"^\s*(\d+)\s+([0-9a-fA-F:.]+)\s+\(([^)]+)\)\s+(\d+(?:\.\d+)?)\s*ms(?:\s+(\d+(?:\.\d+)?)\s*ms)?(?:\s+(\d+(?:\.\d+)?)\s*ms)?" 
+ ) + + for i, line in enumerate(lines): + line = line.strip() + _log.debug(f"Line {i:2d}: {repr(line)}") + + if not line: + continue + + # Skip header lines + if ( + "traceroute to" in line.lower() + or "hops max" in line.lower() + or "byte packets" in line.lower() + ): + _log.debug(f"Line {i:2d}: SKIPPING HEADER") + continue + + # Try to match timeout hop first + timeout_match = timeout_pattern.match(line) + if timeout_match: + hop_number = int(timeout_match.group(1)) + + _log.debug(f"Line {i:2d}: TIMEOUT HOP - {hop_number}: * * *") + + hops.append( + TracerouteHop( + hop_number=hop_number, + ip_address=None, + display_ip=None, + hostname=None, + rtt1=None, + rtt2=None, + rtt3=None, + sent_count=3, # Arista sends 3 pings per hop + last_rtt=None, + best_rtt=None, + worst_rtt=None, + loss_pct=100, # 100% loss for timeout + # BGP enrichment fields (all None for timeout) + asn=None, + org=None, + prefix=None, + country=None, + rir=None, + allocated=None, + ) + ) + continue + + # Try to match partial timeout: " 8 * * 2c0f:fa90:0:8::5 (2c0f:fa90:0:8::5) 179.449 ms" + partial_timeout_match = partial_timeout_pattern.match(line) + if partial_timeout_match: + hop_number = int(partial_timeout_match.group(1)) + hostname = partial_timeout_match.group(2).strip() + ip_address = partial_timeout_match.group(3) + rtt1 = float(partial_timeout_match.group(4)) + + _log.debug( + f"Line {i:2d}: PARTIAL TIMEOUT - {hop_number}: * * {hostname} ({ip_address}) {rtt1}ms" + ) + + hops.append( + TracerouteHop( + hop_number=hop_number, + ip_address=ip_address, + display_ip=None, + hostname=hostname if hostname != ip_address else None, + rtt1=rtt1, + rtt2=None, + rtt3=None, + sent_count=3, + last_rtt=rtt1, + best_rtt=rtt1, + worst_rtt=rtt1, + loss_pct=66, # 2 out of 3 packets lost + asn=None, + org=None, + prefix=None, + country=None, + rir=None, + allocated=None, + ) + ) + continue + + # Try to match triple multipath IPv6 + triple_multipath_match = triple_multipath_pattern.match(line) + if 
triple_multipath_match: + hop_number = int(triple_multipath_match.group(1)) + hostname1 = triple_multipath_match.group(2).strip() + ip1 = triple_multipath_match.group(3) + rtt1 = float(triple_multipath_match.group(4)) + hostname2 = triple_multipath_match.group(5).strip() + ip2 = triple_multipath_match.group(6) + rtt2 = float(triple_multipath_match.group(7)) + hostname3 = triple_multipath_match.group(8).strip() + ip3 = triple_multipath_match.group(9) + rtt3 = float(triple_multipath_match.group(10)) + + _log.debug( + f"Line {i:2d}: TRIPLE MULTIPATH - {hop_number}: {hostname1}/{hostname2}/{hostname3}" + ) + + display_hostname = f"{hostname1} / {hostname2} / {hostname3}" + + hops.append( + TracerouteHop( + hop_number=hop_number, + ip_address=ip1, + display_ip=None, + hostname=display_hostname, + rtt1=rtt1, + rtt2=rtt2, + rtt3=rtt3, + sent_count=3, + last_rtt=rtt3, + best_rtt=min(rtt1, rtt2, rtt3), + worst_rtt=max(rtt1, rtt2, rtt3), + loss_pct=0, # No loss if we got responses + asn=None, + org=None, + prefix=None, + country=None, + rir=None, + allocated=None, + ) + ) + continue + + # Try to match complex multipath with mixed timeouts + complex_multipath_match = complex_multipath_pattern.match(line) + if complex_multipath_match: + hop_number = int(complex_multipath_match.group(1)) + hostname1 = complex_multipath_match.group(2).strip() + ip1 = complex_multipath_match.group(3) + rtt1 = float(complex_multipath_match.group(4)) + hostname2 = complex_multipath_match.group(5).strip() + ip2 = complex_multipath_match.group(6) + rtt2 = float(complex_multipath_match.group(7)) + + # Check for third IP or timeout + rtt3 = None + hostname3 = None + has_third = complex_multipath_match.group(8) is not None + if has_third: + hostname3 = complex_multipath_match.group(8).strip() + rtt3 = float(complex_multipath_match.group(10)) + + _log.debug( + f"Line {i:2d}: COMPLEX MULTIPATH - {hop_number}: {hostname1}/{hostname2}{('/' + hostname3) if hostname3 else ''}" + ) + + display_hostname = 
f"{hostname1} / {hostname2}" + if hostname3: + display_hostname += f" / {hostname3}" + + rtts = [x for x in [rtt1, rtt2, rtt3] if x is not None] + + hops.append( + TracerouteHop( + hop_number=hop_number, + ip_address=ip1, + display_ip=None, + hostname=display_hostname, + rtt1=rtt1, + rtt2=rtt2, + rtt3=rtt3, + sent_count=len(rtts), + last_rtt=rtts[-1] if rtts else None, + best_rtt=min(rtts) if rtts else None, + worst_rtt=max(rtts) if rtts else None, + loss_pct=int((3 - len(rtts)) / 3 * 100), + asn=None, + org=None, + prefix=None, + country=None, + rir=None, + allocated=None, + ) + ) + continue + + # Try to match mixed timeout with start response + mixed_timeout_start_match = mixed_timeout_start_pattern.match(line) + if mixed_timeout_start_match: + hop_number = int(mixed_timeout_start_match.group(1)) + hostname1 = mixed_timeout_start_match.group(2).strip() + ip1 = mixed_timeout_start_match.group(3) + rtt1 = float(mixed_timeout_start_match.group(4)) + hostname2 = mixed_timeout_start_match.group(5).strip() + ip2 = mixed_timeout_start_match.group(6) + rtt2 = float(mixed_timeout_start_match.group(7)) + + _log.debug( + f"Line {i:2d}: MIXED TIMEOUT START - {hop_number}: {hostname1} * {hostname2}" + ) + + display_hostname = f"{hostname1} / * / {hostname2}" + + hops.append( + TracerouteHop( + hop_number=hop_number, + ip_address=ip1, + display_ip=None, + hostname=display_hostname, + rtt1=rtt1, + rtt2=None, # Middle packet timed out + rtt3=rtt2, + sent_count=3, + last_rtt=rtt2, + best_rtt=min(rtt1, rtt2), + worst_rtt=max(rtt1, rtt2), + loss_pct=33, # 1 out of 3 packets lost + asn=None, + org=None, + prefix=None, + country=None, + rir=None, + allocated=None, + ) + ) + continue + + # Try to match MPLS hop + mpls_hop_match = mpls_hop_pattern.match(line) + if mpls_hop_match: + hop_number = int(mpls_hop_match.group(1)) + hostname1 = mpls_hop_match.group(2).strip() + ip1 = mpls_hop_match.group(3) + rtt1 = float(mpls_hop_match.group(4)) + rtt2 = float(mpls_hop_match.group(5)) if 
mpls_hop_match.group(5) else None + + # Check for second MPLS hop in same line + hostname2 = None + ip2 = None + rtt3 = None + if mpls_hop_match.group(6): # Second hostname exists + hostname2 = mpls_hop_match.group(6).strip() + ip2 = mpls_hop_match.group(7) + rtt3 = float(mpls_hop_match.group(8)) + + _log.debug( + f"Line {i:2d}: MPLS HOP - {hop_number}: {hostname1} (MPLS){(' + ' + hostname2) if hostname2 else ''}" + ) + + display_hostname = hostname1 + if hostname2: + display_hostname += f" / {hostname2}" + + rtts = [x for x in [rtt1, rtt2, rtt3] if x is not None] + + hops.append( + TracerouteHop( + hop_number=hop_number, + ip_address=ip1, + display_ip=None, + hostname=display_hostname if display_hostname != ip1 else None, + rtt1=rtt1, + rtt2=rtt2, + rtt3=rtt3, + sent_count=len(rtts), + last_rtt=rtts[-1] if rtts else None, + best_rtt=min(rtts) if rtts else None, + worst_rtt=max(rtts) if rtts else None, + loss_pct=0, # No loss if we got responses + asn=None, + org=None, + prefix=None, + country=None, + rir=None, + allocated=None, + ) + ) + continue + + # Try to match multi-hop line (load balancing) + multi_match = multi_hop_pattern.match(line) + if multi_match: + hop_number = int(multi_match.group(1)) + hostname1 = multi_match.group(2).strip() + ip1 = multi_match.group(3) + rtt1 = float(multi_match.group(4)) + rtt2 = float(multi_match.group(5)) if multi_match.group(5) else None + hostname2 = multi_match.group(6).strip() + ip2 = multi_match.group(7) + rtt3 = float(multi_match.group(8)) + + _log.debug( + f"Line {i:2d}: MULTI HOP - {hop_number}: {hostname1} ({ip1}) and {hostname2} ({ip2})" + ) + + # For multi-hop, we'll create one hop with the first IP and include the second in display + display_hostname = f"{hostname1} / {hostname2}" + + hops.append( + TracerouteHop( + hop_number=hop_number, + ip_address=ip1, + display_ip=None, + hostname=display_hostname, + rtt1=rtt1, + rtt2=rtt2, + rtt3=rtt3, + sent_count=3, + last_rtt=rtt3 if rtt3 else (rtt2 if rtt2 else rtt1), + 
best_rtt=min(x for x in [rtt1, rtt2, rtt3] if x is not None), + worst_rtt=max(x for x in [rtt1, rtt2, rtt3] if x is not None), + loss_pct=0, # No loss if we got responses + # BGP enrichment fields (will be populated by enrichment plugin) + asn=None, + org=None, + prefix=None, + country=None, + rir=None, + allocated=None, + ) + ) + continue + + # Try to match normal hop with hostname + hop_match = hop_pattern.match(line) + if hop_match: + hop_number = int(hop_match.group(1)) + hostname = hop_match.group(2).strip() + ip_address = hop_match.group(3) + rtt1 = float(hop_match.group(4)) + rtt2 = float(hop_match.group(5)) if hop_match.group(5) else None + rtt3 = float(hop_match.group(6)) if hop_match.group(6) else None + + _log.debug( + f"Line {i:2d}: NORMAL HOP - {hop_number}: {hostname} ({ip_address}) RTTs: {rtt1}, {rtt2}, {rtt3}" + ) + + rtts = [x for x in [rtt1, rtt2, rtt3] if x is not None] + + hops.append( + TracerouteHop( + hop_number=hop_number, + ip_address=ip_address, + display_ip=None, + hostname=hostname if hostname != ip_address else None, + rtt1=rtt1, + rtt2=rtt2, + rtt3=rtt3, + sent_count=len(rtts), + last_rtt=rtts[-1] if rtts else None, + best_rtt=min(rtts) if rtts else None, + worst_rtt=max(rtts) if rtts else None, + loss_pct=0, # No loss if we got a response + # BGP enrichment fields (will be populated by enrichment plugin) + asn=None, + org=None, + prefix=None, + country=None, + rir=None, + allocated=None, + ) + ) + continue + + # Try to match IP-only hop (no hostname) + ip_match = ip_only_pattern.match(line) + if ip_match: + hop_number = int(ip_match.group(1)) + ip_display = ip_match.group(2).strip() # The IP shown before parentheses + ip_address = ip_match.group(3) # The IP in parentheses + rtt1 = float(ip_match.group(4)) + rtt2 = float(ip_match.group(5)) if ip_match.group(5) else None + rtt3 = float(ip_match.group(6)) if ip_match.group(6) else None + + _log.debug( + f"Line {i:2d}: IP-ONLY HOP - {hop_number}: {ip_address} RTTs: {rtt1}, {rtt2}, {rtt3}" 
+ ) + + rtts = [x for x in [rtt1, rtt2, rtt3] if x is not None] + + hops.append( + TracerouteHop( + hop_number=hop_number, + ip_address=ip_address, + display_ip=None, + hostname=None, # No hostname for IP-only hops + rtt1=rtt1, + rtt2=rtt2, + rtt3=rtt3, + sent_count=len(rtts), + last_rtt=rtts[-1] if rtts else None, + best_rtt=min(rtts) if rtts else None, + worst_rtt=max(rtts) if rtts else None, + loss_pct=0, # No loss if we got a response + # BGP enrichment fields (will be populated by enrichment plugin) + asn=None, + org=None, + prefix=None, + country=None, + rir=None, + allocated=None, + ) + ) + continue + + _log.debug(f"Line {i:2d}: UNMATCHED - skipping") + + _log.debug(f"Before cleanup: {len(hops)} hops") + + # Clean up consecutive timeout hops at the end + # Keep only the first few timeouts, remove excessive trailing timeouts + if len(hops) > 5: + # Find the last non-timeout hop + last_real_hop = -1 + for i in range(len(hops) - 1, -1, -1): + if not hops[i].is_timeout: + last_real_hop = i + break + + if last_real_hop >= 0: + # Keep at most 3 timeout hops after the last real hop + max_timeouts = 3 + timeout_count = 0 + cleaned_hops = hops[: last_real_hop + 1] # Keep all hops up to last real hop + + for hop in hops[last_real_hop + 1 :]: + if hop.is_timeout: + timeout_count += 1 + if timeout_count <= max_timeouts: + cleaned_hops.append(hop) + else: + _log.debug(f"Removing excessive timeout hop {hop.hop_number}") + else: + # If we find another real hop after timeouts, keep it + cleaned_hops.append(hop) + timeout_count = 0 + + hops = cleaned_hops + + _log.debug(f"After cleanup: {len(hops)} hops") + + for hop in hops: + if hop.is_timeout: + _log.debug(f"Final hop {hop.hop_number}: * (timeout)") + else: + _log.debug( + f"Final hop {hop.hop_number}: {hop.ip_address} ({hop.hostname}) - RTTs: {hop.rtt1}/{hop.rtt2}/{hop.rtt3}" + ) + + _log.info(f"Parsed {len(hops)} hops from Arista traceroute") + + # Extract packet size and max hops from header if available + max_hops = 
30 # Default from your examples + packet_size = 60 # Default from your examples + + for line in text.split("\n"): + if "hops max" in line and "byte packets" in line: + # Example: "traceroute to 177.72.245.178 (177.72.245.178), 30 hops max, 60 byte packets" + parts = line.split() + for i, part in enumerate(parts): + if part == "hops": + try: + max_hops = int(parts[i - 1]) + except (ValueError, IndexError): + pass + elif part == "byte": + try: + packet_size = int(parts[i - 1]) + except (ValueError, IndexError): + pass + break + + return TracerouteResult( + target=target, + source=source, + hops=hops, + max_hops=max_hops, + packet_size=packet_size, + raw_output=text, + asn_organizations={}, + ) + + +class TraceroutePluginArista(OutputPlugin): + """Parse Arista traceroute output.""" + + _hyperglass_builtin: bool = PrivateAttr(True) + platforms: t.Sequence[str] = ("arista_eos",) + directives: t.Sequence[str] = ("__hyperglass_arista_eos_traceroute__",) + common: bool = False + + def process(self, output: "OutputType", query: "Query") -> "OutputType": + """Process Arista traceroute output.""" + # Extract target and source with fallbacks + target = str(query.query_target) if query.query_target else "unknown" + source = "unknown" + + if hasattr(query, "device") and query.device: + source = getattr(query.device, "display_name", None) or getattr( + query.device, "name", "unknown" + ) + + device = getattr(query, "device", None) + if device is not None: + if not getattr(device, "structured_output", False): + return output + try: + _params = use_state("params") + except Exception: + _params = None + if ( + _params + and getattr(_params, "structured", None) + and getattr(_params.structured, "enable_for_traceroute", None) is False + ): + return output + else: + try: + params = use_state("params") + except Exception: + params = None + if not (params and getattr(params, "structured", None)): + return output + if getattr(params.structured, "enable_for_traceroute", None) is False: + 
return output + + return parse_arista_traceroute( + output=output, + target=target, + source=source, + ) diff --git a/hyperglass/plugins/_builtin/trace_route_frr.py b/hyperglass/plugins/_builtin/trace_route_frr.py new file mode 100644 index 0000000..fd4ec6e --- /dev/null +++ b/hyperglass/plugins/_builtin/trace_route_frr.py @@ -0,0 +1,552 @@ +"""Parse FRR traceroute output to structured data.""" + +# Standard Library +import re +import typing as t + +# Third Party +from pydantic import PrivateAttr + +# Project +from hyperglass.log import log +from hyperglass.exceptions.private import ParsingError +from hyperglass.models.data.traceroute import TracerouteResult, TracerouteHop +from hyperglass.state import use_state + +# Local +from .._output import OutputPlugin + +if t.TYPE_CHECKING: + from hyperglass.models.data import OutputDataModel + from hyperglass.models.api.query import Query + from .._output import OutputType + + +def _normalize_output(output: t.Union[str, t.Sequence[str]]) -> t.List[str]: + """Ensure the output is a list of strings.""" + if isinstance(output, str): + return [output] + return list(output) + + +def parse_frr_traceroute( + output: t.Union[str, t.Sequence[str]], target: str, source: str +) -> "OutputDataModel": + """Parse an FRR traceroute text response.""" + result = None + out_list = _normalize_output(output) + + _log = log.bind(plugin=TraceroutePluginFrr.__name__) + combined_output = "\n".join(out_list) + + # DEBUG: Log the raw output we're about to parse + _log.debug(f"=== FRR TRACEROUTE PLUGIN RAW INPUT ===") + _log.debug(f"Target: {target}, Source: {source}") + _log.debug(f"Output pieces: {len(out_list)}") + _log.debug(f"Combined output length: {len(combined_output)}") + _log.debug(f"First 500 chars: {repr(combined_output[:500])}") + _log.debug(f"=== END PLUGIN RAW INPUT ===") + + try: + result = FrrTracerouteTable.parse_text(combined_output, target, source) + except Exception as exc: + _log.error(f"Failed to parse FRR traceroute: {exc}") + 
raise ParsingError(f"Failed to parse FRR traceroute output: {exc}") from exc + + _log.debug(f"=== FINAL STRUCTURED TRACEROUTE RESULT ===") + _log.debug(f"Successfully parsed {len(result.hops)} traceroute hops") + _log.debug(f"Target: {target}, Source: {source}") + for hop in result.hops: + _log.debug(f"Hop {hop.hop_number}: {hop.ip_address or '*'} - RTT: {hop.rtt1 or 'timeout'}") + _log.debug(f"Raw output length: {len(combined_output)} characters") + _log.debug(f"=== END STRUCTURED RESULT ===") + + return result + + +class FrrTracerouteTable(TracerouteResult): + """FRR traceroute table parser.""" + + @classmethod + def parse_text(cls, text: str, target: str, source: str) -> TracerouteResult: + """Parse FRR traceroute text output into structured data.""" + _log = log.bind(parser="FrrTracerouteTable") + + _log.debug(f"=== RAW FRR TRACEROUTE INPUT ===") + _log.debug(f"Target: {target}, Source: {source}") + _log.debug(f"Raw text length: {len(text)} characters") + _log.debug(f"Raw text:\n{repr(text)}") + _log.debug(f"=== END RAW INPUT ===") + + hops = [] + lines = text.strip().split("\n") + + _log.debug(f"Split into {len(lines)} lines") + + # Pattern for normal hop: " 1 bdr2.std.douala-ix.net (196.49.84.34) 0.520 ms 0.451 ms 0.418 ms" + hop_pattern = re.compile( + r"^\s*(\d+)\s+(.+?)\s+\(([^)]+)\)\s+(\d+(?:\.\d+)?)\s*ms(?:\s+(\d+(?:\.\d+)?)\s*ms)?(?:\s+(\d+(?:\.\d+)?)\s*ms)?" 
+ ) + + # Pattern for timeout hop: " 3 * * *" + timeout_pattern = re.compile(r"^\s*(\d+)\s+\*\s*\*\s*\*") + + # Pattern for partial timeout: " 7 port-channel4.core4.mrs1.he.net (184.105.81.30) 132.624 ms 132.589 ms *" + partial_timeout_pattern = re.compile( + r"^\s*(\d+)\s+(.+?)\s+\(([^)]+)\)\s+(\d+(?:\.\d+)?)\s*ms(?:\s+(\d+(?:\.\d+)?)\s*ms)?\s+\*" + ) + + # Pattern for IP-only hop: "15 72.251.0.8 (72.251.0.8) 360.370 ms 352.170 ms 354.132 ms" + ip_only_pattern = re.compile( + r"^\s*(\d+)\s+([0-9a-fA-F:.]+)\s+\(([^)]+)\)\s+(\d+(?:\.\d+)?)\s*ms(?:\s+(\d+(?:\.\d+)?)\s*ms)?(?:\s+(\d+(?:\.\d+)?)\s*ms)?" + ) + + # Complex multi-IP patterns for load balancing scenarios + # Pattern 1: "18 * 2001:41d0:0:50::7:1009 (2001:41d0:0:50::7:1009) 353.548 ms 351.516 ms" + partial_multi_pattern = re.compile( + r"^\s*(\d+)\s+\*\s+(.+?)\s+\(([^)]+)\)\s+(\d+(?:\.\d+)?)\s*ms(?:\s+(\d+(?:\.\d+)?)\s*ms)?" + ) + + # Pattern 2: "12 2001:41d0:aaaa:100::3 (2001:41d0:aaaa:100::3) 274.418 ms 2001:41d0:aaaa:100::5 (2001:41d0:aaaa:100::5) 269.972 ms 282.653 ms" + dual_ip_pattern = re.compile( + r"^\s*(\d+)\s+(.+?)\s+\(([^)]+)\)\s+(\d+(?:\.\d+)?)\s*ms\s+(.+?)\s+\(([^)]+)\)\s+(\d+(?:\.\d+)?)\s*ms(?:\s+(\d+(?:\.\d+)?)\s*ms)?" 
+ ) + + # Pattern 3: More complex multi-IP lines (3 or more IPs) + # "19 2001:41d0:0:50::3:211b (2001:41d0:0:50::3:211b) 351.213 ms 2001:41d0:0:50::7:100f (2001:41d0:0:50::7:100f) 351.090 ms 2001:41d0:0:50::7:100b (2001:41d0:0:50::7:100b) 351.282 ms" + multi_ip_pattern = re.compile( + r"^\s*(\d+)\s+(.+?)\s+\(([^)]+)\)\s+(\d+(?:\.\d+)?)\s*ms\s+(.+?)\s+\(([^)]+)\)\s+(\d+(?:\.\d+)?)\s*ms\s+(.+?)\s+\(([^)]+)\)\s+(\d+(?:\.\d+)?)\s*ms" + ) + + for i, line in enumerate(lines): + line = line.strip() + _log.debug(f"Line {i:2d}: {repr(line)}") + + if not line: + continue + + # Skip header lines + if ( + "traceroute to" in line.lower() + or "hops max" in line.lower() + or "byte packets" in line.lower() + ): + _log.debug(f"Line {i:2d}: SKIPPING HEADER") + continue + + # Try to match timeout hop first + timeout_match = timeout_pattern.match(line) + if timeout_match: + hop_number = int(timeout_match.group(1)) + + _log.debug(f"Line {i:2d}: TIMEOUT HOP - {hop_number}: * * *") + + hops.append( + TracerouteHop( + hop_number=hop_number, + ip_address=None, + display_ip=None, + hostname=None, + rtt1=None, + rtt2=None, + rtt3=None, + sent_count=3, # FRR sends 3 pings per hop + last_rtt=None, + best_rtt=None, + worst_rtt=None, + loss_pct=100, # 100% loss for timeout + # BGP enrichment fields (all None for timeout) + asn=None, + org=None, + prefix=None, + country=None, + rir=None, + allocated=None, + ) + ) + continue + + # Try to match multi-IP pattern (3 IPs) + multi_match = multi_ip_pattern.match(line) + if multi_match: + hop_number = int(multi_match.group(1)) + hostname1 = multi_match.group(2).strip() + ip1 = multi_match.group(3) + rtt1 = float(multi_match.group(4)) + hostname2 = multi_match.group(5).strip() + ip2 = multi_match.group(6) + rtt2 = float(multi_match.group(7)) + hostname3 = multi_match.group(8).strip() + ip3 = multi_match.group(9) + rtt3 = float(multi_match.group(10)) + + _log.debug(f"Line {i:2d}: MULTI-IP HOP (3 IPs) - {hop_number}: {ip1}, {ip2}, {ip3}") + + # Use the 
first IP as primary, combine hostnames + display_hostname = f"{hostname1} / {hostname2} / {hostname3}" + if hostname1 == ip1: + display_hostname = None # All IP-only + + hops.append( + TracerouteHop( + hop_number=hop_number, + ip_address=ip1, + display_ip=None, + hostname=display_hostname, + rtt1=rtt1, + rtt2=rtt2, + rtt3=rtt3, + sent_count=3, + last_rtt=rtt3, + best_rtt=min(rtt1, rtt2, rtt3), + worst_rtt=max(rtt1, rtt2, rtt3), + loss_pct=0, # No loss if we got responses + # BGP enrichment fields (will be populated by enrichment plugin) + asn=None, + org=None, + prefix=None, + country=None, + rir=None, + allocated=None, + ) + ) + continue + + # Try to match dual-IP pattern + dual_match = dual_ip_pattern.match(line) + if dual_match: + hop_number = int(dual_match.group(1)) + hostname1 = dual_match.group(2).strip() + ip1 = dual_match.group(3) + rtt1 = float(dual_match.group(4)) + hostname2 = dual_match.group(5).strip() + ip2 = dual_match.group(6) + rtt2 = float(dual_match.group(7)) + rtt3 = float(dual_match.group(8)) if dual_match.group(8) else None + + _log.debug(f"Line {i:2d}: DUAL-IP HOP - {hop_number}: {ip1} and {ip2}") + + # Use the first IP as primary, combine hostnames + display_hostname = f"{hostname1} / {hostname2}" + if hostname1 == ip1: + display_hostname = None # Both IP-only + + rtts = [x for x in [rtt1, rtt2, rtt3] if x is not None] + + hops.append( + TracerouteHop( + hop_number=hop_number, + ip_address=ip1, + display_ip=None, + hostname=display_hostname, + rtt1=rtt1, + rtt2=rtt2, + rtt3=rtt3, + sent_count=len(rtts), + last_rtt=rtts[-1] if rtts else None, + best_rtt=min(rtts) if rtts else None, + worst_rtt=max(rtts) if rtts else None, + loss_pct=0, # No loss if we got responses + # BGP enrichment fields (will be populated by enrichment plugin) + asn=None, + org=None, + prefix=None, + country=None, + rir=None, + allocated=None, + ) + ) + continue + + # Try to match partial multi pattern (* hostname) + partial_multi_match = 
partial_multi_pattern.match(line) + if partial_multi_match: + hop_number = int(partial_multi_match.group(1)) + hostname = partial_multi_match.group(2).strip() + ip_address = partial_multi_match.group(3) + rtt1 = float(partial_multi_match.group(4)) + rtt2 = float(partial_multi_match.group(5)) if partial_multi_match.group(5) else None + + _log.debug( + f"Line {i:2d}: PARTIAL-MULTI HOP - {hop_number}: * {hostname} ({ip_address})" + ) + + rtts = [x for x in [rtt1, rtt2] if x is not None] + + hops.append( + TracerouteHop( + hop_number=hop_number, + ip_address=ip_address, + display_ip=None, + hostname=hostname if hostname != ip_address else None, + rtt1=rtt1, + rtt2=rtt2, + rtt3=None, + sent_count=3, # Still sent 3, but one timed out + last_rtt=rtts[-1] if rtts else None, + best_rtt=min(rtts) if rtts else None, + worst_rtt=max(rtts) if rtts else None, + loss_pct=33.33, # 1 out of 3 packets lost + # BGP enrichment fields (will be populated by enrichment plugin) + asn=None, + org=None, + prefix=None, + country=None, + rir=None, + allocated=None, + ) + ) + continue + + # Try to match partial timeout (hostname with one *) + partial_timeout_match = partial_timeout_pattern.match(line) + if partial_timeout_match: + hop_number = int(partial_timeout_match.group(1)) + hostname = partial_timeout_match.group(2).strip() + ip_address = partial_timeout_match.group(3) + rtt1 = float(partial_timeout_match.group(4)) + rtt2 = ( + float(partial_timeout_match.group(5)) + if partial_timeout_match.group(5) + else None + ) + + _log.debug( + f"Line {i:2d}: PARTIAL-TIMEOUT HOP - {hop_number}: {hostname} ({ip_address}) with timeout" + ) + + rtts = [x for x in [rtt1, rtt2] if x is not None] + + hops.append( + TracerouteHop( + hop_number=hop_number, + ip_address=ip_address, + display_ip=None, + hostname=hostname if hostname != ip_address else None, + rtt1=rtt1, + rtt2=rtt2, + rtt3=None, + sent_count=3, # Still sent 3, but one timed out + last_rtt=rtts[-1] if rtts else None, + best_rtt=min(rtts) if 
rtts else None, + worst_rtt=max(rtts) if rtts else None, + loss_pct=33.33, # 1 out of 3 packets lost + # BGP enrichment fields (will be populated by enrichment plugin) + asn=None, + org=None, + prefix=None, + country=None, + rir=None, + allocated=None, + ) + ) + continue + + # Try to match normal hop with hostname + hop_match = hop_pattern.match(line) + if hop_match: + hop_number = int(hop_match.group(1)) + hostname = hop_match.group(2).strip() + ip_address = hop_match.group(3) + rtt1 = float(hop_match.group(4)) + rtt2 = float(hop_match.group(5)) if hop_match.group(5) else None + rtt3 = float(hop_match.group(6)) if hop_match.group(6) else None + + _log.debug( + f"Line {i:2d}: NORMAL HOP - {hop_number}: {hostname} ({ip_address}) RTTs: {rtt1}, {rtt2}, {rtt3}" + ) + + rtts = [x for x in [rtt1, rtt2, rtt3] if x is not None] + + hops.append( + TracerouteHop( + hop_number=hop_number, + ip_address=ip_address, + display_ip=None, + hostname=hostname if hostname != ip_address else None, + rtt1=rtt1, + rtt2=rtt2, + rtt3=rtt3, + sent_count=len(rtts), + last_rtt=rtts[-1] if rtts else None, + best_rtt=min(rtts) if rtts else None, + worst_rtt=max(rtts) if rtts else None, + loss_pct=0, # No loss if we got a response + # BGP enrichment fields (will be populated by enrichment plugin) + asn=None, + org=None, + prefix=None, + country=None, + rir=None, + allocated=None, + ) + ) + continue + + # Try to match IP-only hop (no hostname) + ip_match = ip_only_pattern.match(line) + if ip_match: + hop_number = int(ip_match.group(1)) + ip_display = ip_match.group(2).strip() # The IP shown before parentheses + ip_address = ip_match.group(3) # The IP in parentheses + rtt1 = float(ip_match.group(4)) + rtt2 = float(ip_match.group(5)) if ip_match.group(5) else None + rtt3 = float(ip_match.group(6)) if ip_match.group(6) else None + + _log.debug( + f"Line {i:2d}: IP-ONLY HOP - {hop_number}: {ip_address} RTTs: {rtt1}, {rtt2}, {rtt3}" + ) + + rtts = [x for x in [rtt1, rtt2, rtt3] if x is not None] + + 
hops.append( + TracerouteHop( + hop_number=hop_number, + ip_address=ip_address, + display_ip=None, + hostname=None, # No hostname for IP-only hops + rtt1=rtt1, + rtt2=rtt2, + rtt3=rtt3, + sent_count=len(rtts), + last_rtt=rtts[-1] if rtts else None, + best_rtt=min(rtts) if rtts else None, + worst_rtt=max(rtts) if rtts else None, + loss_pct=0, # No loss if we got a response + # BGP enrichment fields (will be populated by enrichment plugin) + asn=None, + org=None, + prefix=None, + country=None, + rir=None, + allocated=None, + ) + ) + continue + + _log.debug(f"Line {i:2d}: UNMATCHED - skipping") + + _log.debug(f"Before cleanup: {len(hops)} hops") + + # Clean up consecutive timeout hops at the end + # Keep only the first few timeouts, remove excessive trailing timeouts + if len(hops) > 5: + # Find the last non-timeout hop + last_real_hop = -1 + for i in range(len(hops) - 1, -1, -1): + if not hops[i].is_timeout: + last_real_hop = i + break + + if last_real_hop >= 0: + # Keep at most 3 timeout hops after the last real hop + max_timeouts = 3 + timeout_count = 0 + cleaned_hops = hops[: last_real_hop + 1] # Keep all hops up to last real hop + + for hop in hops[last_real_hop + 1 :]: + if hop.is_timeout: + timeout_count += 1 + if timeout_count <= max_timeouts: + cleaned_hops.append(hop) + else: + _log.debug(f"Removing excessive timeout hop {hop.hop_number}") + else: + # If we find another real hop after timeouts, keep it + cleaned_hops.append(hop) + timeout_count = 0 + + hops = cleaned_hops + + _log.debug(f"After cleanup: {len(hops)} hops") + + for hop in hops: + if hop.is_timeout: + _log.debug(f"Final hop {hop.hop_number}: * (timeout)") + else: + _log.debug( + f"Final hop {hop.hop_number}: {hop.ip_address} ({hop.hostname or 'no-hostname'}) - RTTs: {hop.rtt1}/{hop.rtt2}/{hop.rtt3}" + ) + + _log.info(f"Parsed {len(hops)} hops from FRR traceroute") + + # Extract packet size and max hops from header if available + max_hops = 30 # Default from your examples + packet_size = 60 # 
Default from your examples (IPv4) + + for line in text.split("\n"): + if "hops max" in line and "byte packets" in line: + # Example: "traceroute to syd.proof.ovh.net (51.161.209.134), 30 hops max, 60 byte packets" + parts = line.split() + for i, part in enumerate(parts): + if part == "hops": + try: + max_hops = int(parts[i - 1]) + except (ValueError, IndexError): + pass + elif part == "byte": + try: + packet_size = int(parts[i - 1]) + except (ValueError, IndexError): + pass + break + + return TracerouteResult( + target=target, + source=source, + hops=hops, + max_hops=max_hops, + packet_size=packet_size, + raw_output=text, + asn_organizations={}, + ) + + +class TraceroutePluginFrr(OutputPlugin): + """Parse FRR traceroute output.""" + + _hyperglass_builtin: bool = PrivateAttr(True) + platforms: t.Sequence[str] = ("frr",) + directives: t.Sequence[str] = ("__hyperglass_frr_traceroute__",) + common: bool = False + + def process(self, output: "OutputType", query: "Query") -> "OutputType": + """Process FRR traceroute output.""" + # Extract target and source with fallbacks + target = str(query.query_target) if query.query_target else "unknown" + source = "unknown" + + if hasattr(query, "device") and query.device: + source = getattr(query.device, "display_name", None) or getattr( + query.device, "name", "unknown" + ) + + device = getattr(query, "device", None) + if device is not None: + if not getattr(device, "structured_output", False): + return output + try: + _params = use_state("params") + except Exception: + _params = None + if ( + _params + and getattr(_params, "structured", None) + and getattr(_params.structured, "enable_for_traceroute", None) is False + ): + return output + else: + try: + params = use_state("params") + except Exception: + params = None + if not (params and getattr(params, "structured", None)): + return output + if getattr(params.structured, "enable_for_traceroute", None) is False: + return output + + return parse_frr_traceroute( + output=output, + 
target=target, + source=source, + ) diff --git a/hyperglass/plugins/_builtin/trace_route_huawei.py b/hyperglass/plugins/_builtin/trace_route_huawei.py index bee4c45..549e4eb 100644 --- a/hyperglass/plugins/_builtin/trace_route_huawei.py +++ b/hyperglass/plugins/_builtin/trace_route_huawei.py @@ -11,6 +11,7 @@ from pydantic import PrivateAttr from hyperglass.log import log from hyperglass.exceptions.private import ParsingError from hyperglass.models.data.traceroute import TracerouteResult, TracerouteHop +from hyperglass.state import use_state # Local from .._output import OutputPlugin @@ -246,6 +247,30 @@ class TraceroutePluginHuawei(OutputPlugin): query.device, "name", "unknown" ) + device = getattr(query, "device", None) + if device is not None: + if not getattr(device, "structured_output", False): + return output + try: + _params = use_state("params") + except Exception: + _params = None + if ( + _params + and getattr(_params, "structured", None) + and getattr(_params.structured, "enable_for_traceroute", None) is False + ): + return output + else: + try: + params = use_state("params") + except Exception: + params = None + if not (params and getattr(params, "structured", None)): + return output + if getattr(params.structured, "enable_for_traceroute", None) is False: + return output + return parse_huawei_traceroute( output=output, target=target, diff --git a/hyperglass/plugins/_builtin/trace_route_juniper.py b/hyperglass/plugins/_builtin/trace_route_juniper.py new file mode 100644 index 0000000..9169a6e --- /dev/null +++ b/hyperglass/plugins/_builtin/trace_route_juniper.py @@ -0,0 +1,573 @@ +"""Parse Juniper traceroute output to structured data.""" + +# Standard Library +import re +import typing as t + +# Third Party +from pydantic import PrivateAttr + +# Project +from hyperglass.log import log +from hyperglass.exceptions.private import ParsingError +from hyperglass.models.data.traceroute import TracerouteResult, TracerouteHop +from hyperglass.state import 
use_state + +# Local +from .._output import OutputPlugin + +if t.TYPE_CHECKING: + from hyperglass.models.data import OutputDataModel + from hyperglass.models.api.query import Query + from .._output import OutputType + + +def _normalize_output(output: t.Union[str, t.Sequence[str]]) -> t.List[str]: + """Ensure the output is a list of strings.""" + if isinstance(output, str): + return [output] + return list(output) + + +def parse_juniper_traceroute( + output: t.Union[str, t.Sequence[str]], target: str, source: str +) -> "OutputDataModel": + """Parse a Juniper traceroute text response.""" + result = None + out_list = _normalize_output(output) + + _log = log.bind(plugin=TraceroutePluginJuniper.__name__) + combined_output = "\n".join(out_list) + + # DEBUG: Log the raw output we're about to parse + _log.debug(f"=== JUNIPER TRACEROUTE PLUGIN RAW INPUT ===") + _log.debug(f"Target: {target}, Source: {source}") + _log.debug(f"Output pieces: {len(out_list)}") + _log.debug(f"Combined output length: {len(combined_output)}") + _log.debug(f"First 500 chars: {repr(combined_output[:500])}") + _log.debug(f"=== END PLUGIN RAW INPUT ===") + + try: + result = JuniperTracerouteTable.parse_text(combined_output, target, source) + except Exception as exc: + _log.error(f"Failed to parse Juniper traceroute: {exc}") + raise ParsingError(f"Failed to parse Juniper traceroute output: {exc}") from exc + + _log.debug(f"=== FINAL STRUCTURED TRACEROUTE RESULT ===") + _log.debug(f"Successfully parsed {len(result.hops)} traceroute hops") + _log.debug(f"Target: {target}, Source: {source}") + for hop in result.hops: + _log.debug(f"Hop {hop.hop_number}: {hop.ip_address or '*'} - RTT: {hop.rtt1 or 'timeout'}") + _log.debug(f"Raw output length: {len(combined_output)} characters") + _log.debug(f"=== END STRUCTURED RESULT ===") + + return result + + +class JuniperTracerouteTable(TracerouteResult): + """Juniper traceroute table parser.""" + + @classmethod + def parse_text(cls, text: str, target: str, source: 
str) -> TracerouteResult: + """Parse Juniper traceroute text output into structured data.""" + _log = log.bind(parser="JuniperTracerouteTable") + + _log.debug(f"=== RAW JUNIPER TRACEROUTE INPUT ===") + _log.debug(f"Target: {target}, Source: {source}") + _log.debug(f"Raw text length: {len(text)} characters") + _log.debug(f"Raw text:\n{repr(text)}") + _log.debug(f"=== END RAW INPUT ===") + + hops = [] + lines = text.strip().split("\n") + + _log.debug(f"Split into {len(lines)} lines") + + # Pattern for normal hop: " 1 102.218.156.197 (102.218.156.197) 0.928 ms 0.968 ms 0.677 ms" + hop_pattern = re.compile( + r"^\s*(\d+)\s+([^\s]+)\s+\(([^)]+)\)\s+(\d+(?:\.\d+)?)\s*ms(?:\s+(\d+(?:\.\d+)?)\s*ms)?(?:\s+(\d+(?:\.\d+)?)\s*ms)?" + ) + + # Pattern for timeout with IP: " 6 * 130.117.15.146 (130.117.15.146) 162.503 ms 162.773 ms" + timeout_with_ip_pattern = re.compile( + r"^\s*(\d+)\s+\*\s+([^\s]+)\s+\(([^)]+)\)\s+(\d+(?:\.\d+)?)\s*ms(?:\s+(\d+(?:\.\d+)?)\s*ms)?(?:\s+(\d+(?:\.\d+)?)\s*ms)?" + ) + + # Pattern for mixed timeout and IP: " 7 80.231.196.36 (80.231.196.36) 328.264 ms 328.938 ms *" + mixed_timeout_pattern = re.compile( + r"^\s*(\d+)\s+([^\s]+)\s+\(([^)]+)\)\s+(\d+(?:\.\d+)?)\s*ms(?:\s+(\d+(?:\.\d+)?)\s*ms)?\s+\*" + ) + + # Pattern for multipath: " 3 197.157.77.179 (197.157.77.179) 169.860 ms 41.78.188.48 (41.78.188.48) 185.519 ms 1006.603 ms" + multipath_pattern = re.compile( + r"^\s*(\d+)\s+([^\s]+)\s+\(([^)]+)\)\s+(\d+(?:\.\d+)?)\s*ms\s+([^\s]+)\s+\(([^)]+)\)\s+(\d+(?:\.\d+)?)\s*ms(?:\s+(\d+(?:\.\d+)?)\s*ms)?" + ) + + # Pattern for IPv6 multipath: "25 2001:41d0:0:50::7:100b (2001:41d0:0:50::7:100b) 460.762 ms 2001:41d0:0:50::7:1009 (2001:41d0:0:50::7:1009) 464.993 ms 2001:41d0:0:50::7:100f (2001:41d0:0:50::7:100f) 464.366 ms" + ipv6_multipath_pattern = re.compile( + r"^\s*(\d+)\s+([a-fA-F0-9:]+)\s+\(([^)]+)\)\s+(\d+(?:\.\d+)?)\s*ms\s+([a-fA-F0-9:]+)\s+\(([^)]+)\)\s+(\d+(?:\.\d+)?)\s*ms(?:\s+([a-fA-F0-9:]+)\s+\(([^)]+)\)\s+(\d+(?:\.\d+)?)\s*ms)?" 
+ ) + + # Pattern for complete timeout: " 1 * * *" + timeout_pattern = re.compile(r"^\s*(\d+)\s+\*\s*\*\s*\*") + + # Pattern for partial timeout at end: "10 * * 2001:978:3::12e (2001:978:3::12e) 200.936 ms" + partial_timeout_pattern = re.compile( + r"^\s*(\d+)\s+\*\s+\*\s+([^\s]+)\s+\(([^)]+)\)\s+(\d+(?:\.\d+)?)\s*ms" + ) + + i = 0 + while i < len(lines): + line = lines[i].strip() + _log.debug(f"Line {i:2d}: {repr(line)}") + + if not line: + i += 1 + continue + + # Skip header lines + if ( + "traceroute to" in line.lower() + or "traceroute6 to" in line.lower() + or "hops max" in line.lower() + or "byte packets" in line.lower() + ): + _log.debug(f"Line {i:2d}: SKIPPING HEADER") + i += 1 + continue + + # Skip MPLS label lines + if "MPLS Label=" in line: + _log.debug(f"Line {i:2d}: SKIPPING MPLS LABEL") + i += 1 + continue + + # Try to match complete timeout hop first + timeout_match = timeout_pattern.match(line) + if timeout_match: + hop_number = int(timeout_match.group(1)) + + _log.debug(f"Line {i:2d}: TIMEOUT HOP - {hop_number}: * * *") + + hops.append( + TracerouteHop( + hop_number=hop_number, + ip_address=None, + display_ip=None, + hostname=None, + rtt1=None, + rtt2=None, + rtt3=None, + sent_count=3, + last_rtt=None, + best_rtt=None, + worst_rtt=None, + loss_pct=100, # 100% loss for timeout + # BGP enrichment fields (all None for timeout) + asn=None, + org=None, + prefix=None, + country=None, + rir=None, + allocated=None, + ) + ) + i += 1 + continue + + # Try to match partial timeout: "10 * * 2001:978:3::12e (2001:978:3::12e) 200.936 ms" + partial_timeout_match = partial_timeout_pattern.match(line) + if partial_timeout_match: + hop_number = int(partial_timeout_match.group(1)) + ip_address = partial_timeout_match.group(3) + hostname = partial_timeout_match.group(2).strip() + rtt1 = float(partial_timeout_match.group(4)) + + _log.debug( + f"Line {i:2d}: PARTIAL TIMEOUT HOP - {hop_number}: * * {hostname} ({ip_address}) {rtt1}ms" + ) + + hops.append( + TracerouteHop( 
+ hop_number=hop_number, + ip_address=ip_address, + display_ip=None, + hostname=hostname if hostname != ip_address else None, + rtt1=rtt1, + rtt2=None, + rtt3=None, + sent_count=3, + last_rtt=rtt1, + best_rtt=rtt1, + worst_rtt=rtt1, + loss_pct=66, # 2 out of 3 packets lost + # BGP enrichment fields + asn=None, + org=None, + prefix=None, + country=None, + rir=None, + allocated=None, + ) + ) + i += 1 + continue + + # Try to match IPv6 multipath + ipv6_multipath_match = ipv6_multipath_pattern.match(line) + if ipv6_multipath_match: + hop_number = int(ipv6_multipath_match.group(1)) + ip1 = ipv6_multipath_match.group(3) + hostname1 = ipv6_multipath_match.group(2).strip() + rtt1 = float(ipv6_multipath_match.group(4)) + ip2 = ipv6_multipath_match.group(6) + hostname2 = ipv6_multipath_match.group(5).strip() + rtt2 = float(ipv6_multipath_match.group(7)) + + rtt3 = None + if ipv6_multipath_match.group(10): # Third IP/RTT pair + rtt3 = float(ipv6_multipath_match.group(10)) + + _log.debug( + f"Line {i:2d}: IPv6 MULTIPATH HOP - {hop_number}: {hostname1}/{hostname2} ({ip1}/{ip2})" + ) + + display_hostname = f"{hostname1} / {hostname2}" + if ipv6_multipath_match.group(8): # Third hostname + hostname3 = ipv6_multipath_match.group(8).strip() + display_hostname += f" / {hostname3}" + + rtts = [x for x in [rtt1, rtt2, rtt3] if x is not None] + + hops.append( + TracerouteHop( + hop_number=hop_number, + ip_address=ip1, + display_ip=None, + hostname=display_hostname, + rtt1=rtt1, + rtt2=rtt2, + rtt3=rtt3, + sent_count=len(rtts), + last_rtt=rtts[-1] if rtts else None, + best_rtt=min(rtts) if rtts else None, + worst_rtt=max(rtts) if rtts else None, + loss_pct=0, # No loss if we got responses + # BGP enrichment fields + asn=None, + org=None, + prefix=None, + country=None, + rir=None, + allocated=None, + ) + ) + i += 1 + continue + + # Try to match multipath IPv4 + multipath_match = multipath_pattern.match(line) + if multipath_match: + hop_number = int(multipath_match.group(1)) + hostname1 = 
multipath_match.group(2).strip() + ip1 = multipath_match.group(3) + rtt1 = float(multipath_match.group(4)) + hostname2 = multipath_match.group(5).strip() + ip2 = multipath_match.group(6) + rtt2 = float(multipath_match.group(7)) + rtt3 = float(multipath_match.group(8)) if multipath_match.group(8) else None + + _log.debug( + f"Line {i:2d}: MULTIPATH HOP - {hop_number}: {hostname1}/{hostname2} ({ip1}/{ip2})" + ) + + display_hostname = f"{hostname1} / {hostname2}" + rtts = [x for x in [rtt1, rtt2, rtt3] if x is not None] + + hops.append( + TracerouteHop( + hop_number=hop_number, + ip_address=ip1, + display_ip=None, + hostname=display_hostname, + rtt1=rtt1, + rtt2=rtt2, + rtt3=rtt3, + sent_count=len(rtts), + last_rtt=rtts[-1] if rtts else None, + best_rtt=min(rtts) if rtts else None, + worst_rtt=max(rtts) if rtts else None, + loss_pct=0, # No loss if we got responses + # BGP enrichment fields + asn=None, + org=None, + prefix=None, + country=None, + rir=None, + allocated=None, + ) + ) + i += 1 + continue + + # Try to match timeout with IP: " 6 * 130.117.15.146 (130.117.15.146) 162.503 ms 162.773 ms" + timeout_with_ip_match = timeout_with_ip_pattern.match(line) + if timeout_with_ip_match: + hop_number = int(timeout_with_ip_match.group(1)) + hostname = timeout_with_ip_match.group(2).strip() + ip_address = timeout_with_ip_match.group(3) + rtt1 = float(timeout_with_ip_match.group(4)) + rtt2 = ( + float(timeout_with_ip_match.group(5)) + if timeout_with_ip_match.group(5) + else None + ) + rtt3 = ( + float(timeout_with_ip_match.group(6)) + if timeout_with_ip_match.group(6) + else None + ) + + _log.debug( + f"Line {i:2d}: TIMEOUT WITH IP - {hop_number}: * {hostname} ({ip_address})" + ) + + rtts = [x for x in [rtt1, rtt2, rtt3] if x is not None] + loss_pct = int((3 - len(rtts)) / 3 * 100) if len(rtts) > 0 else 100 + + hops.append( + TracerouteHop( + hop_number=hop_number, + ip_address=ip_address, + display_ip=None, + hostname=hostname if hostname != ip_address else None, + 
rtt1=rtt1, + rtt2=rtt2, + rtt3=rtt3, + sent_count=3, + last_rtt=rtts[-1] if rtts else None, + best_rtt=min(rtts) if rtts else None, + worst_rtt=max(rtts) if rtts else None, + loss_pct=loss_pct, + # BGP enrichment fields + asn=None, + org=None, + prefix=None, + country=None, + rir=None, + allocated=None, + ) + ) + i += 1 + continue + + # Try to match mixed timeout: " 7 80.231.196.36 (80.231.196.36) 328.264 ms 328.938 ms *" + mixed_timeout_match = mixed_timeout_pattern.match(line) + if mixed_timeout_match: + hop_number = int(mixed_timeout_match.group(1)) + hostname = mixed_timeout_match.group(2).strip() + ip_address = mixed_timeout_match.group(3) + rtt1 = float(mixed_timeout_match.group(4)) + rtt2 = float(mixed_timeout_match.group(5)) if mixed_timeout_match.group(5) else None + + _log.debug( + f"Line {i:2d}: MIXED TIMEOUT - {hop_number}: {hostname} ({ip_address}) with *" + ) + + rtts = [x for x in [rtt1, rtt2] if x is not None] + loss_pct = int((3 - len(rtts)) / 3 * 100) + + hops.append( + TracerouteHop( + hop_number=hop_number, + ip_address=ip_address, + display_ip=None, + hostname=hostname if hostname != ip_address else None, + rtt1=rtt1, + rtt2=rtt2, + rtt3=None, + sent_count=3, + last_rtt=rtts[-1] if rtts else None, + best_rtt=min(rtts) if rtts else None, + worst_rtt=max(rtts) if rtts else None, + loss_pct=loss_pct, + # BGP enrichment fields + asn=None, + org=None, + prefix=None, + country=None, + rir=None, + allocated=None, + ) + ) + i += 1 + continue + + # Try to match normal hop + hop_match = hop_pattern.match(line) + if hop_match: + hop_number = int(hop_match.group(1)) + hostname = hop_match.group(2).strip() + ip_address = hop_match.group(3) + rtt1 = float(hop_match.group(4)) + rtt2 = float(hop_match.group(5)) if hop_match.group(5) else None + rtt3 = float(hop_match.group(6)) if hop_match.group(6) else None + + _log.debug( + f"Line {i:2d}: NORMAL HOP - {hop_number}: {hostname} ({ip_address}) RTTs: {rtt1}, {rtt2}, {rtt3}" + ) + + rtts = [x for x in [rtt1, 
rtt2, rtt3] if x is not None] + + hops.append( + TracerouteHop( + hop_number=hop_number, + ip_address=ip_address, + display_ip=None, + hostname=hostname if hostname != ip_address else None, + rtt1=rtt1, + rtt2=rtt2, + rtt3=rtt3, + sent_count=len(rtts), + last_rtt=rtts[-1] if rtts else None, + best_rtt=min(rtts) if rtts else None, + worst_rtt=max(rtts) if rtts else None, + loss_pct=0, # No loss if we got a response + # BGP enrichment fields + asn=None, + org=None, + prefix=None, + country=None, + rir=None, + allocated=None, + ) + ) + i += 1 + continue + + _log.debug(f"Line {i:2d}: UNMATCHED - skipping") + i += 1 + + _log.debug(f"Before cleanup: {len(hops)} hops") + + # Clean up consecutive timeout hops at the end + if len(hops) > 5: + # Find the last non-timeout hop + last_real_hop = -1 + for i in range(len(hops) - 1, -1, -1): + if not hops[i].is_timeout: + last_real_hop = i + break + + if last_real_hop >= 0: + # Keep at most 3 timeout hops after the last real hop + max_timeouts = 3 + timeout_count = 0 + cleaned_hops = hops[: last_real_hop + 1] # Keep all hops up to last real hop + + for hop in hops[last_real_hop + 1 :]: + if hop.is_timeout: + timeout_count += 1 + if timeout_count <= max_timeouts: + cleaned_hops.append(hop) + else: + _log.debug(f"Removing excessive timeout hop {hop.hop_number}") + else: + # If we find another real hop after timeouts, keep it + cleaned_hops.append(hop) + timeout_count = 0 + + hops = cleaned_hops + + _log.debug(f"After cleanup: {len(hops)} hops") + + for hop in hops: + if hop.is_timeout: + _log.debug(f"Final hop {hop.hop_number}: * (timeout)") + else: + _log.debug( + f"Final hop {hop.hop_number}: {hop.ip_address} ({hop.hostname or 'no-hostname'}) - RTTs: {hop.rtt1}/{hop.rtt2}/{hop.rtt3}" + ) + + _log.info(f"Parsed {len(hops)} hops from Juniper traceroute") + + # Extract packet size and max hops from header if available + max_hops = 30 # Default for Juniper + packet_size = 52 # Default from your examples + + for line in 
text.split("\n"): + if "hops max" in line and "byte packets" in line: + # Example: "traceroute to 51.161.209.134 (51.161.209.134) from 196.201.112.49, 30 hops max, 52 byte packets" + parts = line.split() + for i, part in enumerate(parts): + if part == "hops": + try: + max_hops = int(parts[i - 1]) + except (ValueError, IndexError): + pass + elif part == "byte": + try: + packet_size = int(parts[i - 1]) + except (ValueError, IndexError): + pass + break + + return TracerouteResult( + target=target, + source=source, + hops=hops, + max_hops=max_hops, + packet_size=packet_size, + raw_output=text, + asn_organizations={}, + ) + + +class TraceroutePluginJuniper(OutputPlugin): + """Parse Juniper traceroute output.""" + + _hyperglass_builtin: bool = PrivateAttr(True) + platforms: t.Sequence[str] = ("juniper", "juniper_junos") + directives: t.Sequence[str] = ("__hyperglass_juniper_traceroute__",) + common: bool = False + + def process(self, output: "OutputType", query: "Query") -> "OutputType": + """Process Juniper traceroute output.""" + # Extract target and source with fallbacks + target = str(query.query_target) if query.query_target else "unknown" + source = "unknown" + + if hasattr(query, "device") and query.device: + source = getattr(query.device, "display_name", None) or getattr( + query.device, "name", "unknown" + ) + + device = getattr(query, "device", None) + if device is not None: + if not getattr(device, "structured_output", False): + return output + try: + _params = use_state("params") + except Exception: + _params = None + if ( + _params + and getattr(_params, "structured", None) + and getattr(_params.structured, "enable_for_traceroute", None) is False + ): + return output + else: + try: + params = use_state("params") + except Exception: + params = None + if not (params and getattr(params, "structured", None)): + return output + if getattr(params.structured, "enable_for_traceroute", None) is False: + return output + + return parse_juniper_traceroute( + 
output=output, + target=target, + source=source, + ) diff --git a/hyperglass/plugins/_builtin/trace_route_mikrotik.py b/hyperglass/plugins/_builtin/trace_route_mikrotik.py index 5d5f63f..d0b93a4 100644 --- a/hyperglass/plugins/_builtin/trace_route_mikrotik.py +++ b/hyperglass/plugins/_builtin/trace_route_mikrotik.py @@ -7,9 +7,10 @@ import typing as t from pydantic import PrivateAttr, ValidationError # Project -from hyperglass.log import log +from hyperglass.log import log, log as _log from hyperglass.exceptions.private import ParsingError from hyperglass.models.parsing.mikrotik import MikrotikTracerouteTable +from hyperglass.state import use_state # Local from .._output import OutputPlugin @@ -26,6 +27,33 @@ def _normalize_output(output: t.Union[str, t.Sequence[str]]) -> t.List[str]: return [output] return list(output) +def _clean_traceroute_only( + output: t.Union[str, t.Sequence[str]], query: "Query" +) -> t.Union[str, t.Tuple[str, ...]]: + """Run only the traceroute-specific cleaner and return same-shaped result. + + This calls the internal _clean_traceroute_output method on the + MikrotikGarbageOutput plugin so the cleaned traceroute text is used + as the 'raw' output exposed to clients. 
+ """ + from .mikrotik_garbage_output import MikrotikGarbageOutput + + out_list = _normalize_output(output) + cleaner = MikrotikGarbageOutput() + + cleaned_list: t.List[str] = [] + for piece in out_list: + try: + cleaned_piece = cleaner._clean_traceroute_output(piece) + except Exception: + # If cleaner fails for any piece, fall back to the original piece + cleaned_piece = piece + cleaned_list.append(cleaned_piece) + + if isinstance(output, str): + return cleaned_list[0] if cleaned_list else "" + return tuple(cleaned_list) + def parse_mikrotik_traceroute( output: t.Union[str, t.Sequence[str]], target: str, source: str @@ -37,21 +65,18 @@ def parse_mikrotik_traceroute( _log = log.bind(plugin=TraceroutePluginMikrotik.__name__) combined_output = "\n".join(out_list) - # DEBUG: Log the raw output we're about to parse - _log.debug(f"=== MIKROTIK TRACEROUTE PLUGIN RAW INPUT ===") - _log.debug(f"Target: {target}, Source: {source}") - _log.debug(f"Output pieces: {len(out_list)}") - for i, piece in enumerate(out_list): - _log.debug(f"Output piece {i}: {repr(piece[:200])}...") # Truncate for readability - _log.debug(f"Combined output length: {len(combined_output)}") - - # Check if this looks like cleaned or raw output + # Minimal summary of the input - avoid dumping full raw output to logs contains_paging = "-- [Q quit|C-z pause]" in combined_output contains_multiple_tables = combined_output.count("ADDRESS") > 1 - _log.debug(f"Contains paging prompts: {contains_paging}") - _log.debug(f"Contains multiple ADDRESS headers: {contains_multiple_tables}") - _log.debug(f"First 500 chars: {repr(combined_output[:500])}") - _log.debug(f"=== END PLUGIN RAW INPUT ===") + _log.debug( + "Received traceroute plugin input", + target=target, + source=source, + pieces=len(out_list), + combined_len=len(combined_output), + contains_paging=contains_paging, + multiple_tables=contains_multiple_tables, + ) try: # Pass the entire combined output to the parser at once @@ -62,20 +87,13 @@ def 
parse_mikrotik_traceroute( # This is the processed output from MikrotikGarbageOutput plugin, not the original raw router output result.raw_output = combined_output - # DEBUG: Log the final structured result - _log.debug(f"=== FINAL STRUCTURED TRACEROUTE RESULT ===") - _log.debug(f"Successfully parsed {len(validated.hops)} traceroute hops") - _log.debug(f"Target: {result.target}, Source: {result.source}") - for hop in result.hops: - _log.debug( - f"Hop {hop.hop_number}: {hop.ip_address} - Loss: {hop.loss_pct}% - Sent: {hop.sent_count}" - ) - _log.debug(f"AS Path: {result.as_path_summary}") + # Concise structured logging for result _log.debug( - f"Cleaned raw output length: {len(result.raw_output) if result.raw_output else 0} characters" + "Parsed traceroute result", + hops=len(validated.hops), + target=result.target, + source=result.source, ) - _log.debug(f"Copy button will show CLEANED output (after MikrotikGarbageOutput processing)") - _log.debug(f"=== END STRUCTURED RESULT ===") except ValidationError as err: _log.critical(err) @@ -100,7 +118,50 @@ class TraceroutePluginMikrotik(OutputPlugin): target = getattr(query, "target", "unknown") source = getattr(query, "source", "unknown") + # Try to get target from query_target which is more reliable + if hasattr(query, "query_target") and query.query_target: + target = str(query.query_target) + if hasattr(query, "device") and query.device: source = getattr(query.device, "name", source) + + _log = log.bind(plugin=TraceroutePluginMikrotik.__name__) + + # Debug: emit the raw response exactly as returned by the router. + # Do not transform, join, or normalize the output — log it verbatim. + try: + # Ensure the router output is embedded in the log message body so it + # is visible regardless of the logger's formatter configuration. 
+ if isinstance(output, (tuple, list)): + try: + combined_raw = "\n".join(output) + except Exception: + # Fall back to repr if join fails for non-string elements + combined_raw = repr(output) + else: + combined_raw = output if isinstance(output, str) else repr(output) + + # Log the full verbatim router response (DEBUG level). + _log.debug("Router raw output:\n{}", combined_raw) + except Exception: + # Don't let logging interfere with normal processing + _log.exception("Failed to log router raw output") + + try: + params = use_state("params") + except Exception: + params = None + + device = getattr(query, "device", None) + + if device is None: + return _clean_traceroute_only(output, query) + else: + if params is None: + return _clean_traceroute_only(output, query) + if not getattr(params, "structured", None): + return _clean_traceroute_only(output, query) + if getattr(params.structured, "enable_for_traceroute", None) is False: + return _clean_traceroute_only(output, query) return parse_mikrotik_traceroute(output, target, source) diff --git a/hyperglass/plugins/_builtin/traceroute_ip_enrichment.py b/hyperglass/plugins/_builtin/traceroute_ip_enrichment.py index 904e995..33bb36e 100644 --- a/hyperglass/plugins/_builtin/traceroute_ip_enrichment.py +++ b/hyperglass/plugins/_builtin/traceroute_ip_enrichment.py @@ -66,8 +66,14 @@ class ZTracerouteIpEnrichment(OutputPlugin): from hyperglass.state import use_state params = use_state("params") - if not params.structured.ip_enrichment.enabled: - _log.debug("IP enrichment disabled in configuration") + # If structured config missing or traceroute enrichment disabled, skip + # IP enrichment but still perform reverse DNS lookups. 
+ if ( + not getattr(params, "structured", None) + or not params.structured.ip_enrichment.enrich_traceroute + or getattr(params.structured, "enable_for_traceroute", None) is False + ): + _log.debug("IP enrichment for traceroute disabled in configuration") # Still do reverse DNS if enrichment is disabled for hop in output.hops: if hop.ip_address and hop.hostname is None: diff --git a/hyperglass/ui/components/looking-glass-form.tsx b/hyperglass/ui/components/looking-glass-form.tsx index 2da4633..776bfdd 100644 --- a/hyperglass/ui/components/looking-glass-form.tsx +++ b/hyperglass/ui/components/looking-glass-form.tsx @@ -87,7 +87,7 @@ export const LookingGlassForm = (): JSX.Element => { return tmp; }, [form.queryType, form.queryLocation, getDirective]); - function submitHandler(): void { + async function submitHandler(): Promise { if (process.env.NODE_ENV === 'development') { console.table({ 'Query Location': form.queryLocation.toString(), @@ -97,6 +97,11 @@ export const LookingGlassForm = (): JSX.Element => { }); } + // Note: IP enrichment refresh is now handled server-side on query + // submission when enabled. Removing client-side best-effort refresh + // to centralize refresh logic and avoid redundant requests from many + // clients. + // Before submitting a query, make sure the greeting is acknowledged if required. This should // be handled before loading the app, but people be sneaky. if (!greetingReady) { diff --git a/hyperglass/ui/components/output/traceroute-fields.tsx b/hyperglass/ui/components/output/traceroute-fields.tsx index eaa8484..953895f 100644 --- a/hyperglass/ui/components/output/traceroute-fields.tsx +++ b/hyperglass/ui/components/output/traceroute-fields.tsx @@ -55,9 +55,18 @@ export const ASNField = (props: ASNFieldProps): JSX.Element => { ); } - // Display ASN as-is (no prefix added since backend now sends clean format) - const asnDisplay = asn; // Just use the value directly: "12345" or "IXP" - const tooltipLabel = org && org !== 'None' ? 
`${asnDisplay} - ${org}` : asnDisplay; + // Display ASN. If this hop is an IXP (asn === 'IXP'), the table cell + // keeps the literal "IXP" label; when the IXP name is available in + // `org`, it is surfaced via the tooltip as "IXP - org" for clarity. + let asnDisplay = asn; // default: "12345" or "IXP" + // For table display we want IXPs to appear as the literal "IXP". + if (asn === 'IXP') { + asnDisplay = 'IXP'; + } + const tooltipLabel = org && org !== 'None' + ? (asn === 'IXP' ? `IXP - ${org}` : `${asnDisplay} - ${org}`) + : asnDisplay; return ( diff --git a/hyperglass/ui/components/path/path.tsx b/hyperglass/ui/components/path/path.tsx index d3073b9..0bd2494 100644 --- a/hyperglass/ui/components/path/path.tsx +++ b/hyperglass/ui/components/path/path.tsx @@ -26,9 +26,52 @@ export const Path = (props: PathProps): JSX.Element => { const output = response?.output as AllStructuredResponses; const bg = useColorValue('light.50', 'dark.900'); const centered = useBreakpointValue({ base: false, lg: true }) ?? true; + const addResponse = useFormState(s => s.addResponse); return ( <> - + { + // When opening the AS path modal, attempt on-demand ASN enrichment + // if the response does not already contain ASN organization data. 
+ try { + onOpen(); + if (!response) return; + const out = response.output as any; + const asnOrgs = out?.asn_organizations || {}; + if (Object.keys(asnOrgs).length > 0) return; + + // Collect unique ASNs from the output depending on type + let asns: string[] = []; + if (out?.routes) { + const all = out.routes.flatMap((r: any) => r.as_path || []); + asns = Array.from(new Set(all.map((a: any) => String(a)))); + } else if (out?.hops) { + const all = out.hops.map((h: any) => h.asn).filter(Boolean); + asns = Array.from(new Set(all.map((a: any) => String(a)))); + } + + if (asns.length === 0) return; + + const resp = await fetch('/api/aspath/enrich', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ as_path: asns }), + }); + if (!resp.ok) return; + const j = await resp.json(); + if (j?.success && j.asn_organizations) { + // Merge ASN orgs into the stored response and update state + out.asn_organizations = { ...(out.asn_organizations || {}), ...j.asn_organizations }; + addResponse(device, { ...response, output: out }); + } + } catch (e) { + // Ignore enrichment failures + // eslint-disable-next-line no-console + console.debug('AS path enrichment failed', e); + onOpen(); + } + }} + /> > { let asPaths: string[][] = []; let asnOrgs: Record = {}; + // For traceroute data we may have IXPs represented as asn === 'IXP' with + // the IXP name stored per-hop in hop.org. Collect per-path org arrays so + // nodes for IXPs can show the proper IXP name instead of the generic + // "IXP" label. 
+ const pathGroupOrgs: Record> = {}; if (isBGPData(data)) { // Handle BGP routes with AS paths @@ -70,20 +75,25 @@ function* buildElements( } else if (isTracerouteData(data)) { // Handle traceroute hops - build AS path from hop ASNs const hopAsns: string[] = []; + const hopOrgs: Array = []; let currentAsn = ''; - + for (const hop of data.hops) { if (hop.asn && hop.asn !== 'None' && hop.asn !== currentAsn) { currentAsn = hop.asn; hopAsns.push(hop.asn); + hopOrgs.push(hop.org ?? undefined); } } - + if (hopAsns.length > 0) { // Remove the base ASN if it's the first hop to avoid duplication - const filteredAsns = hopAsns[0] === base.asn ? hopAsns.slice(1) : hopAsns; + const removeBase = hopAsns[0] === base.asn; + const filteredAsns = removeBase ? hopAsns.slice(1) : hopAsns; + const filteredOrgs = removeBase ? hopOrgs.slice(1) : hopOrgs; if (filteredAsns.length > 0) { asPaths = [filteredAsns]; + pathGroupOrgs[0] = filteredOrgs; } } @@ -182,13 +192,25 @@ function* buildElements( const y = g.node(node).y - NODE_HEIGHT * (idx * 6); // Get each ASN's positions. + // Determine display name for this node. Prefer ASN org mapping, but + // for traceroute IXPs prefer the per-hop IXP name if present. + let nodeName = asnOrgs[asn]?.name || (asn === '0' ? 'Private/Unknown' : `AS${asn}`); + if (asn === 'IXP') { + const ixpName = pathGroupOrgs[groupIdx]?.[idx]; + if (ixpName && ixpName !== 'None') { + nodeName = ixpName; + } else { + nodeName = 'IXP'; + } + } + yield { id: node, type: 'ASNode', position: { x, y }, data: { asn: `${asn}`, - name: asn === 'IXP' ? 'IXP' : asnOrgs[asn]?.name || (asn === '0' ? 'Private/Unknown' : `AS${asn}`), + name: nodeName, hasChildren: idx < endIdx, hasParents: true, },