From 4126656c05bcd4c49c35c8f7a68a9b802961016e Mon Sep 17 00:00:00 2001 From: majorlinux Date: Sat, 2 May 2026 14:58:07 -0400 Subject: [PATCH] wiki: update fail2ban digest + netdata docker health + 3 new articles - fail2ban-digest-mode-fleet: recidive-only email model, sshd now silent, defaults-debian.conf gotcha added - netdata-docker-health-alarm-tuning: 30m/10m config, tuning history table - New: wp-fail2ban-logpath-debian-ubuntu, lora-adapter-gguf-conversion-fails, tailscale-status-json-hostname-localhost-ios - Various article updates and nav index refreshes Co-Authored-By: Claude Opus 4.6 (1M context) --- .../distro-specific/wsl2-backup-powershell.md | 2 +- .../networking/ssh-config-key-management.md | 2 +- .../dns-networking/wake-on-lan-router-ssh.md | 2 +- 02-selfhosting/index.md | 2 +- .../netdata-docker-health-alarm-tuning.md | 30 +++- .../security/clamav-fleet-deployment.md | 2 +- .../security/fail2ban-digest-mode-fleet.md | 36 +++-- .../wp-fail2ban-logpath-debian-ubuntu.md | 151 ++++++++++++++++++ .../services/mastodon-instance-tuning.md | 2 +- .../ansible-check-mode-false-positives.md | 2 +- .../lora-adapter-gguf-conversion-fails.md | 119 ++++++++++++++ 05-troubleshooting/index.md | 4 +- 05-troubleshooting/isp-sni-filtering-caddy.md | 98 +++++++++++- ...cale-status-json-hostname-localhost-ios.md | 116 ++++++++++++++ ...sl-default-shell-breaks-remote-commands.md | 2 +- .../windows-sshd-stops-after-reboot.md | 2 +- .../yt-dlp-fedora-js-challenge.md | 2 +- MajorWiki-Deploy-Status.md | 19 ++- README.md | 2 +- SUMMARY.md | 5 +- index.md | 2 +- 21 files changed, 567 insertions(+), 35 deletions(-) create mode 100644 02-selfhosting/security/wp-fail2ban-logpath-debian-ubuntu.md create mode 100644 05-troubleshooting/gpu-display/lora-adapter-gguf-conversion-fails.md create mode 100644 05-troubleshooting/networking/tailscale-status-json-hostname-localhost-ios.md diff --git a/01-linux/distro-specific/wsl2-backup-powershell.md 
b/01-linux/distro-specific/wsl2-backup-powershell.md index 1fc7e27..04a4e4c 100644 --- a/01-linux/distro-specific/wsl2-backup-powershell.md +++ b/01-linux/distro-specific/wsl2-backup-powershell.md @@ -10,7 +10,7 @@ tags: - majorrig status: published created: 2026-03-16 -updated: 2026-04-29T22:45 +updated: 2026-04-30T05:21 --- # WSL2 Backup via PowerShell Scheduled Task diff --git a/01-linux/networking/ssh-config-key-management.md b/01-linux/networking/ssh-config-key-management.md index 2bfedbf..86919fa 100644 --- a/01-linux/networking/ssh-config-key-management.md +++ b/01-linux/networking/ssh-config-key-management.md @@ -10,7 +10,7 @@ tags: - remote-access status: published created: 2026-03-08 -updated: 2026-04-22T09:20 +updated: 2026-04-30T05:21 --- # SSH Config and Key Management diff --git a/02-selfhosting/dns-networking/wake-on-lan-router-ssh.md b/02-selfhosting/dns-networking/wake-on-lan-router-ssh.md index 0d3731e..f3a125e 100644 --- a/02-selfhosting/dns-networking/wake-on-lan-router-ssh.md +++ b/02-selfhosting/dns-networking/wake-on-lan-router-ssh.md @@ -7,7 +7,7 @@ tags: - asus - ssh created: 2026-04-19 -updated: 2026-04-29T22:45 +updated: 2026-04-30T05:21 --- # Wake-on-LAN via Router SSH diff --git a/02-selfhosting/index.md b/02-selfhosting/index.md index ca9a4c4..362544e 100644 --- a/02-selfhosting/index.md +++ b/02-selfhosting/index.md @@ -1,6 +1,6 @@ --- created: 2026-04-13T10:15 -updated: 2026-04-29T22:45 +updated: 2026-04-30T05:21 --- # 🏠 Self-Hosting & Homelab diff --git a/02-selfhosting/monitoring/netdata-docker-health-alarm-tuning.md b/02-selfhosting/monitoring/netdata-docker-health-alarm-tuning.md index b5fc1cf..5b6b395 100644 --- a/02-selfhosting/monitoring/netdata-docker-health-alarm-tuning.md +++ b/02-selfhosting/monitoring/netdata-docker-health-alarm-tuning.md @@ -1,11 +1,17 @@ --- -title: "Tuning Netdata Docker Health Alarms to Prevent Update Flapping" +title: Tuning Netdata Docker Health Alarms to Prevent Update Flapping domain: selfhosting 
category: monitoring -tags: [netdata, docker, nextcloud, alarms, health, monitoring] +tags: + - netdata + - docker + - nextcloud + - alarms + - health + - monitoring status: published created: 2026-03-18 -updated: 2026-03-28 +updated: 2026-05-02T11:04 --- # Tuning Netdata Docker Health Alarms to Prevent Update Flapping @@ -61,9 +67,9 @@ chart labels: container_name=!nextcloud-aio-nextcloud * ### Dedicated Nextcloud AIO Alarm -Added 2026-03-23, updated 2026-03-28. The `nextcloud-aio-nextcloud` container needs a more lenient window than other containers. Its healthcheck (`/healthcheck.sh`) verifies PostgreSQL connectivity (port 5432) and PHP-FPM (port 9000). PHP-FPM takes ~90 seconds to warm up after a normal restart β€” but during nightly AIO update cycles, the full startup (occ upgrade, app updates, migrations) can take 5+ minutes. On 2026-03-27, a startup hung and left the container unhealthy for 20 hours until the next nightly cycle replaced it. +Added 2026-03-23, updated 2026-05-02. The `nextcloud-aio-nextcloud` container needs a more lenient window than other containers. Its healthcheck (`/healthcheck.sh`) verifies PostgreSQL connectivity (port 5432) and PHP-FPM (port 9000). PHP-FPM takes ~90 seconds to warm up after a normal restart β€” but during nightly AIO update cycles, the full startup (occ upgrade, app updates, migrations) can take 5+ minutes. On 2026-03-27, a startup hung and left the container unhealthy for 20 hours until the next nightly cycle replaced it. 
-The dedicated alarm uses a 10-minute lookup window and 10-minute delay to absorb normal startup, while still catching sustained failures: +The dedicated alarm uses a 30-minute lookup window and 10-minute delay to absorb normal startup and update cycles (~40 minutes total grace), while still catching sustained failures: ```ini # Dedicated alarm for nextcloud-aio-nextcloud β€” lenient window to absorb nightly update cycle @@ -76,15 +82,23 @@ template: docker_nextcloud_unhealthy component: Docker units: status every: 30s - lookup: average -10m of unhealthy + lookup: average -30m of unhealthy chart labels: container_name=nextcloud-aio-nextcloud - warn: $this > 0 + warn: $this >= 1 delay: up 10m down 5m multiplier 1.5 max 30m summary: Nextcloud container health sustained - info: nextcloud-aio-nextcloud has been unhealthy for a sustained period β€” not a transient update blip + info: nextcloud-aio-nextcloud has been continuously unhealthy for 30+ minutes β€” not a transient update blip to: sysadmin ``` +**Tuning history:** + +| Date | Lookup | Delay | Trigger | Notes | +|---|---|---|---|---| +| 2026-03-23 | 35m | 35m | Initial split from general alarm | Absorbed PHP-FPM warm-up | +| 2026-04-29 | 15m | 5m | Backup blip (~6m) never triggered | Tightened after stability | +| 2026-05-02 | 30m | 10m | 15m still too aggressive for update cycles | ~40m total grace; catches real outages | + ## Watchdog Cron: Auto-Restart on Sustained Unhealthy If the Nextcloud container stays unhealthy for more than 1 hour (well past any normal startup window), a cron watchdog on majorlab auto-restarts it and logs the event. This was added 2026-03-28 after an incident where the container sat unhealthy for 20 hours until the next nightly backup cycle replaced it. 
diff --git a/02-selfhosting/security/clamav-fleet-deployment.md b/02-selfhosting/security/clamav-fleet-deployment.md index 373c75f..b731795 100644 --- a/02-selfhosting/security/clamav-fleet-deployment.md +++ b/02-selfhosting/security/clamav-fleet-deployment.md @@ -11,7 +11,7 @@ tags: - cron status: published created: 2026-04-18 -updated: 2026-04-18T11:13 +updated: 2026-04-30T05:21 --- # ClamAV Fleet Deployment with Ansible diff --git a/02-selfhosting/security/fail2ban-digest-mode-fleet.md b/02-selfhosting/security/fail2ban-digest-mode-fleet.md index 9004784..5499fbc 100644 --- a/02-selfhosting/security/fail2ban-digest-mode-fleet.md +++ b/02-selfhosting/security/fail2ban-digest-mode-fleet.md @@ -1,11 +1,18 @@ --- -title: "Fail2Ban Digest Mode β€” Fleet-Wide Quiet Alerts" +title: Fail2Ban Digest Mode β€” Fleet-Wide Quiet Alerts domain: selfhosting category: security -tags: [fail2ban, security, email, ansible, fleet, cron, digest] +tags: + - fail2ban + - security + - email + - ansible + - fleet + - cron + - digest status: published created: 2026-04-22 -updated: 2026-04-22 +updated: 2026-05-02T14:56 --- # Fail2Ban Digest Mode β€” Fleet-Wide Quiet Alerts @@ -21,11 +28,11 @@ Three tiers replace the firehose: | Tier | Jails | Action | Why | |------|-------|--------|-----| -| **Immediate email** | `sshd`, `recidive` | `action_mwl` | Security-critical β€” someone is actively targeting auth or is a repeat offender | +| **Immediate email** | `recidive` | `action_mwl` | Repeat offenders only β€” someone has been banned multiple times across jails | | **Silent ban** | Everything else | `action_` (default) | Ban happens, firewall rule applied, no email sent | | **Daily digest** | All jails | Cron script at 08:00 UTC | One summary email per host with ban counts across all jails | -This reduces email volume from hundreds per day to ~10 (one digest per host + occasional sshd/recidive alerts). 
+This reduces email volume from hundreds per day to ~10 (one digest per host + occasional recidive alerts). ## jail.local Configuration @@ -40,18 +47,20 @@ action = %(action_)s This overrides the stock `action_mwl` for all jails. Bans still happen β€” the firewall rule is applied β€” but no email is sent. -### Keep immediate alerts for critical jails +### Keep immediate alerts for recidive only ```ini [sshd] enabled = true -action = %(action_mwl)s +action = %(action_)s [recidive] enabled = true action = %(action_mwl)s ``` +> **Updated 2026-05-02:** sshd was moved to silent (`action_`). Only recidive (repeat offenders) now triggers immediate email. sshd bans are captured in the daily digest. + ### Clean up email subjects with fq-hostname By default, fail2ban uses the system FQDN in email subjects. On Tailscale hosts, this produces ugly subjects like `[Fail2Ban] sshd: banned 1.2.3.4 on MajorToot.tail7f2d9.ts.net`. Override it in `[DEFAULT]`: @@ -91,8 +100,9 @@ The playbook `configure_fail2ban_digest.yml` deploys the full digest model fleet ### What it does 1. Deploys a Python helper script that performs **section-aware editing** of `jail.local` (see gotchas below) -2. Sets `action = %(action_)s` in `[DEFAULT]` -3. Sets `action = %(action_mwl)s` in `[sshd]` and `[recidive]` +2. Sets `action = %(action_)s` in `[DEFAULT]` and `[sshd]` +3. Sets `action = %(action_mwl)s` in `[recidive]` +4. Removes stale `action = %(action_mwl)s` from `defaults-debian.conf` if present 4. Sets `fq-hostname` per host using an override dict 5. Deploys the digest script from a Jinja2 template 6. Creates the cron job via `ansible.builtin.cron` @@ -143,6 +153,14 @@ option 'action' in section 'DEFAULT' already exists The Python editor script handles this by replacing existing keys rather than appending. +### defaults-debian.conf overrides jail.local + +On Debian/Ubuntu, `/etc/fail2ban/jail.d/defaults-debian.conf` is loaded **after** `jail.local`. 
If it contains `action = %(action_mwl)s`, it silently overrides your silent default — every jail sends email on every ban. The Ansible playbook now removes this line automatically. If you see per-ban emails after deploying digest mode, check this file first: + +```bash +grep action /etc/fail2ban/jail.d/defaults-debian.conf +``` + ### fq-hostname scope Setting `fq-hostname` in `[DEFAULT]` affects all action templates that use the `<fq-hostname>` tag — including both immediate emails and the digest subject. This is the desired behavior, but be aware that it overrides the system hostname globally within fail2ban. diff --git a/02-selfhosting/security/wp-fail2ban-logpath-debian-ubuntu.md b/02-selfhosting/security/wp-fail2ban-logpath-debian-ubuntu.md new file mode 100644 index 0000000..dcee73f --- /dev/null +++ b/02-selfhosting/security/wp-fail2ban-logpath-debian-ubuntu.md @@ -0,0 +1,151 @@ +--- +title: "wp-fail2ban Plugin Logpath on Debian/Ubuntu (auth.log, not syslog)" +domain: selfhosting +category: security +tags: [fail2ban, wordpress, wp-fail2ban, debugging, gotcha, debian, ubuntu] +status: published +created: 2026-04-30 +updated: 2026-04-30 +--- +# wp-fail2ban Plugin Logpath on Debian/Ubuntu (auth.log, not syslog) + +## The Problem + +You install the [WP fail2ban](https://wordpress.org/plugins/wp-fail2ban/) WordPress plugin, configure the fleet-standard `wordpress-hard`, `wordpress-soft`, and `wordpress-extra` jails, and… nothing. Weeks pass. `fail2ban-client status wordpress-hard` reports `Total failed: 0, Total banned: 0`. Your site is being attacked, but the jails are dead. + +Meanwhile the `wordpress-login` jail (which reads Apache access logs for `POST /wp-login.php` directly) is happily catching brute-forcers. So the problem isn't fail2ban itself — it's specifically the wp-fail2ban-plugin-derived jails. + +## The Cause + +The wp-fail2ban plugin emits events via PHP's `syslog()` call with facility `LOG_AUTH`. 
On Debian/Ubuntu, rsyslog routes the `auth` facility to **`/var/log/auth.log`**, NOT `/var/log/syslog`. On RHEL/Fedora it's `/var/log/secure`. + +A lot of tutorials, ansible-galaxy roles, and copy-paste config snippets specify: + +```ini +logpath = /var/log/syslog +``` + +That's wrong on Debian/Ubuntu. The events never land there, so the filter regex has nothing to match, so the jail catches zero events forever. Silently. + +## Diagnostic Steps + +If a `wordpress-{hard,soft,extra}` jail shows `Total failed: 0` over a long window despite the plugin being active and the site getting attacked: + +**1. Check what the jail thinks it's watching:** + +```bash +sudo fail2ban-client status wordpress-hard | grep "File list" +``` + +**2. Check where wp-fail2ban events actually land:** + +```bash +sudo grep -c "wordpress(" /var/log/auth.log /var/log/syslog /var/log/secure 2>/dev/null +``` + +You'll see something like: + +``` +/var/log/auth.log:314 +/var/log/syslog:0 +``` + +**3. If the jail's `File list` β‰  the file with events, fix the `logpath`.** + +A real event line on Debian/Ubuntu looks like: + +``` +2026-04-18T23:28:21.027004-04:00 hostname wordpress(example.com)[719989]: XML-RPC authentication failure for someone from 1.2.3.4 +``` + +The `wordpress(domain)[pid]` syslog tag is the giveaway β€” those are wp-fail2ban events. 
+ +## The Fix + +Edit the jail blocks in `/etc/fail2ban/jail.local` (or your Ansible source for the jail) and set: + +```ini +[wordpress-hard] +enabled = true +port = http,https +filter = wordpress-hard +logpath = /var/log/auth.log +maxretry = 1 +findtime = 60 +bantime = 30d +backend = polling + +[wordpress-soft] +enabled = true +port = http,https +filter = wordpress-soft +logpath = /var/log/auth.log +maxretry = 5 +findtime = 60 +bantime = 30d +backend = polling + +[wordpress-extra] +enabled = true +port = http,https +filter = wordpress-extra +logpath = /var/log/auth.log +maxretry = 5 +findtime = 60 +bantime = 30d +backend = polling +``` + +Then: + +```bash +sudo fail2ban-client -t # validate +sudo fail2ban-client reload +sudo fail2ban-client status wordpress-hard | grep "File list" +# should now show /var/log/auth.log +``` + +## Verification + +You can prove the filter regex actually matches your real events without waiting for an attack β€” run `fail2ban-regex` against the rotated log: + +```bash +sudo fail2ban-regex /var/log/auth.log.1 /etc/fail2ban/filter.d/wordpress-hard.conf | grep -E "Failregex:|Lines:" +``` + +Healthy output looks like: + +``` +Failregex: 81 total +Lines: 13008 lines, 0 ignored, 81 matched, 12927 missed +``` + +If you see `Failregex: 0 total`, the filter regex doesn't match what the plugin actually emits β€” which is a different bug (filter version skew vs. plugin version), not the logpath gotcha. Investigate `/etc/fail2ban/filter.d/wordpress-{hard,soft}.conf` against actual event lines. + +> **Note:** On a freshly-fixed jail, counters will sit at `Total failed: 0` for a while β€” the `polling` backend starts at the file's current EOF, so old events aren't retroactively counted. New events from the moment of `reload` onward will accumulate. Allow a few days of normal attack traffic before declaring the fix broken. 
+ +## Distribution Cheat Sheet + +| Distro family | wp-fail2ban events land in | +|---|---| +| Debian / Ubuntu | `/var/log/auth.log` | +| RHEL / CentOS / Fedora | `/var/log/secure` | +| systemd-journal-only systems | `journalctl SYSLOG_FACILITY=4` (use `backend = systemd` + `journalmatch = SYSLOG_FACILITY=4`) | + +If you have a mixed fleet, parameterize the path: + +```yaml +# Ansible vars +wp_fail2ban_log_path: "{{ '/var/log/auth.log' if ansible_os_family == 'Debian' else '/var/log/secure' }}" +``` + +## Why wordpress-login Is Unaffected + +The `wordpress-login` jail is a different beast — it reads `/var/log/apache2/access.log` directly and matches `^<HOST> -.*"POST /wp-login.php` via the `wordpress-login` filter. No plugin involved, no syslog facility involved. So a host can have `wordpress-login` working perfectly while `wordpress-{hard,soft,extra}` are silently dead. Don't let a healthy `wordpress-login` reassure you that the rest of the wp-fail2ban stack is also fine. + +## Related + +- [[fail2ban-wordpress-login-jail]] — the access-log layer that catches WP brute force without any plugin dependency +- [[fail2ban-apache-bad-request-jail]] +- [[fail2ban-apache-php-probe-jail]] +- [[clamav-fleet-deployment]] diff --git a/02-selfhosting/services/mastodon-instance-tuning.md b/02-selfhosting/services/mastodon-instance-tuning.md index 2f8241b..f5459a3 100644 --- a/02-selfhosting/services/mastodon-instance-tuning.md +++ b/02-selfhosting/services/mastodon-instance-tuning.md @@ -10,7 +10,7 @@ tags: - docker status: published created: 2026-04-02 -updated: 2026-04-29T22:45 +updated: 2026-04-30T05:21 --- # Mastodon Instance Tuning diff --git a/05-troubleshooting/ansible-check-mode-false-positives.md b/05-troubleshooting/ansible-check-mode-false-positives.md index f88e251..6796756 100644 --- a/05-troubleshooting/ansible-check-mode-false-positives.md +++ b/05-troubleshooting/ansible-check-mode-false-positives.md @@ -11,7 +11,7 @@ tags: - troubleshooting status: published 
created: 2026-04-18 -updated: 2026-04-29T22:45 +updated: 2026-04-30T05:21 --- # Ansible Check Mode False Positives in Verify/Assert Tasks diff --git a/05-troubleshooting/gpu-display/lora-adapter-gguf-conversion-fails.md b/05-troubleshooting/gpu-display/lora-adapter-gguf-conversion-fails.md new file mode 100644 index 0000000..2b74b08 --- /dev/null +++ b/05-troubleshooting/gpu-display/lora-adapter-gguf-conversion-fails.md @@ -0,0 +1,119 @@ +--- +title: "LoRA adapter β€” GGUF conversion fails with 'config.json not found'" +domain: troubleshooting +category: gpu-display +tags: [lora, qlora, gguf, llama.cpp, unsloth, fine-tuning, qwen] +status: published +created: 2026-04-30 +updated: 2026-04-30 +--- + +# LoRA adapter β€” GGUF conversion fails with 'config.json not found' + +## Problem + +After a QLoRA fine-tune, you point `llama.cpp/convert_hf_to_gguf.py` at the training output directory and it crashes immediately: + +``` +FileNotFoundError: [Errno 2] No such file or directory: + '/path/to/training-runs//final/config.json' +``` + +The output directory looks fine β€” it contains: + +``` +adapter_config.json +adapter_model.safetensors (~150 MB for a 7B base) +chat_template.jinja +tokenizer_config.json +tokenizer.json +``` + +But no `config.json`, and `adapter_model.safetensors` is 150 MB β€” way smaller than the ~14 GB you'd expect for a full Qwen2.5-7B 16-bit checkpoint. + +## Root cause + +`model.save_pretrained()` after a LoRA/QLoRA train saves **only the adapter weights**, not a merged full-precision model. `convert_hf_to_gguf.py` expects a full HuggingFace model directory β€” it reads `config.json` to identify the architecture. Adapter-only directories don't have one. + +You need to merge the LoRA adapter into the base model first, then point the GGUF converter at the merged dir. 
+ +## Solution + +### Quick fix β€” inline merge step + +Insert this block between training completion and `convert_hf_to_gguf.py`: + +```python +from unsloth import FastLanguageModel + +adapter = "/path/to/training-runs//final" +merged = "/path/to/training-runs//merged" + +model, tok = FastLanguageModel.from_pretrained( + model_name=adapter, + max_seq_length=2048, + load_in_4bit=True, +) +model.save_pretrained_merged(merged, tok, save_method="merged_16bit") +``` + +Then run the GGUF converter against the **merged** dir, not the adapter dir: + +```bash +python3 llama.cpp/convert_hf_to_gguf.py /path/to/training-runs//merged \ + --outfile model-f16.gguf --outtype f16 +``` + +The merged dir will contain `config.json`, `model-00001-of-00004.safetensors` (multiple shards totaling the full base model size), `generation_config.json`, etc. + +### Cleaner fix β€” use a wrapper + +If you do this often, encapsulate it: + +1. Wrapper Python script accepts `--adapter`, `--output`, `--skip-merge`, `--all-quants` +2. Step 1: load adapter via `FastLanguageModel.from_pretrained()`, call `save_pretrained_merged()` +3. Step 2: subprocess `convert_hf_to_gguf.py` on the merged dir +4. Step 3: subprocess `llama-quantize` for each requested quant + +This is what `~/corpus/scripts/convert_gguf.py` does on MajorRig (rewritten 2026-04-09 for the MajorTwin v7b cycle). + +## Why this trips people up + +- Unsloth and PEFT both save adapter-only by default after `trainer.save_model()` or `model.save_pretrained()`. There's no warning that downstream tools expect a merged model. +- The training output **looks** complete β€” there's a `tokenizer.json`, a `chat_template.jinja`, and a non-trivial `.safetensors`. It feels like a checkpoint. +- A pipeline that uses `convert_gguf.py` (with merge) once and then someone reimplements Step 4 inline (skipping the wrapper) will silently lose the merge step. 
This is what happened in MajorTwin v8c (Apr 30, 2026) — see [[majortwin-v8b-plan#Pipeline Bug + Fix (2026-04-30)]]. + +## Verification checklist + +After training, before running the GGUF converter, verify the directory you're pointing at: + +| File | Adapter-only dir | Merged dir | +|---|---|---| +| `adapter_config.json` | ✅ | ❌ | +| `adapter_model.safetensors` | ✅ (~150 MB / 7B) | ❌ | +| `config.json` | ❌ | ✅ | +| `model-*.safetensors` (sharded) | ❌ | ✅ (~14 GB / 7B) | +| `generation_config.json` | ❌ | ✅ | +| `tokenizer.json` | ✅ | ✅ | + +If the directory contains only the files ticked in the "Adapter-only dir" column, you need to merge before converting. + +## Resuming a failed pipeline without re-training + +The adapter is small and self-contained. If your pipeline crashes at the GGUF step, you do NOT need to retrain — the LoRA adapter at `training-runs/<run>/final/` is intact. Write a resume wrapper that runs only: + +1. Merge (`save_pretrained_merged`) +2. F16 conversion (`convert_hf_to_gguf.py`) +3. Quantization (`llama-quantize`) +4. Deploy + +This saves the cost of however many GPU-hours the training took. See `~/corpus/scripts/resume_v8c_step4.sh` on MajorRig for an example. + +## Related + +- [[qwen-14b-oom-3080ti]] — base model size choice on a 12GB GPU +- [[majortwin-v8b-plan]] — v8c pipeline architecture and resume + +## Maintenance + +- 2026-04-30 — Created after MajorTwin v8c pipeline failed Step 4. Root-caused, patched, resumed. diff --git a/05-troubleshooting/index.md b/05-troubleshooting/index.md index 764b5cf..6625575 100644 --- a/05-troubleshooting/index.md +++ b/05-troubleshooting/index.md @@ -1,6 +1,6 @@ --- created: 2026-03-15T06:37 -updated: 2026-04-29T22:45 +updated: 2026-04-30T10:41 --- # 🔧 General Troubleshooting @@ -8,12 +8,14 @@ Practical fixes for common Linux, networking, and application problems. 
## πŸ–₯️ GPU & AI - [Qwen2.5-14B OOM on RTX 3080 Ti (12GB)](gpu-display/qwen-14b-oom-3080ti.md) +- [LoRA adapter β€” GGUF conversion fails with 'config.json not found'](gpu-display/lora-adapter-gguf-conversion-fails.md) ## 🌐 Networking & Web - [Apache Outage: Fail2ban Self-Ban + Missing iptables Rules](networking/fail2ban-self-ban-apache-outage.md) - [Mail Client Stops Receiving: Fail2ban IMAP Self-Ban](networking/fail2ban-imap-self-ban-mail-client.md) - [firewalld: Mail Ports Wiped After Reload](networking/firewalld-mail-ports-reset.md) - [Tailscale SSH: Unexpected Re-Authentication Prompt](networking/tailscale-ssh-reauth-prompt.md) +- [iOS Tailscale Clients Report HostName="localhost" β€” Breaks /etc/hosts Generators](networking/tailscale-status-json-hostname-localhost-ios.md) - [rsync over Tailscale: Hung in TCP Teardown After Transfer Completes](networking/rsync-tailscale-teardown-stall.md) - [Windows OpenSSH: WSL Default Shell Breaks Remote Commands](networking/windows-openssh-wsl-default-shell-breaks-remote-commands.md) - [Pi-hole AI Blocklist Blocks Claude Desktop (ERR_CONNECTION_REFUSED)](networking/pihole-blocks-claude-desktop.md) diff --git a/05-troubleshooting/isp-sni-filtering-caddy.md b/05-troubleshooting/isp-sni-filtering-caddy.md index 8fcd85c..bd435bf 100644 --- a/05-troubleshooting/isp-sni-filtering-caddy.md +++ b/05-troubleshooting/isp-sni-filtering-caddy.md @@ -1,11 +1,17 @@ --- -title: "ISP SNI Filtering & Caddy Troubleshooting" +title: ISP SNI Filtering & Caddy Troubleshooting domain: troubleshooting category: general -tags: [isp, sni, caddy, tls, dns, cloudflare] +tags: + - isp + - sni + - caddy + - tls + - dns + - cloudflare status: published created: 2026-04-02 -updated: 2026-04-02 +updated: 2026-04-30T13:07 --- # ISP SNI Filtering & Caddy Troubleshooting @@ -29,3 +35,89 @@ notes.majorshouse.com { ``` Once the hostname was changed to one without the "wiki" keyword, the TLS handshake completed successfully. 
+ +--- + +## πŸ” 2026-04-30 Update β€” Stale A Record + Cloudflare Proxy Fix + +The hostname rename held for ~4 weeks. On 2026-04-30 the wiki went down with a TLS handshake failure on `notes.majorshouse.com`. The on-the-spot framing was "ISP filter expanded to include 'notes'" β€” but Cloudflare DNS audit showed a different (and arguably worse) root cause: **the `notes` A record was pointing at `136.54.3.248`, an IP that is not majorlab's current home IP.** Whichever host responds at that address either does not run Caddy or does not know about the `notes.majorshouse.com` SNI, so the TLS handshake was rejected with `internal_error 80`. + +### Re-diagnosis + +```bash +# Cert + Caddy + mkdocs all healthy on majorlab +$ ssh majorlab 'systemctl is-active caddy; ss -tlnp | grep :443' +active +LISTEN 0 4096 *:443 users:(("caddy",pid=1549,fd=7)) + +# Loopback-served TLS works fine β€” cert valid Mar 11 β†’ Jun 9 2026 +$ ssh majorlab 'curl -sS -o /dev/null -w "%{http_code}\n" --resolve notes.majorshouse.com:443:127.0.0.1 https://notes.majorshouse.com/' +200 + +# External TLS handshake gets rejected with internal_error +$ openssl s_client -servername notes.majorshouse.com -connect 136.54.3.248:443 +… SSL alert number 80 (internal_error) … +``` + +### The smoking-gun comparison + +Other `*.majorshouse.com` services worked because they were CNAMEs to the apex, which resolves to majorlab's actual home IP: + +| Subdomain | DNS shape | Final IP | Status | +|---|---|---|---| +| `notes.majorshouse.com` | **A β†’ `136.54.3.248`** (stale) | `136.54.3.248` (wrong host) | ❌ TLS rejected | +| `git.majorshouse.com` | CNAME β†’ `majorshouse.com.` | `136.56.0.55` (majorlab) | βœ… | +| `n8n.majorshouse.com` | CNAME β†’ `majorshouse.com.` | `136.56.0.55` (majorlab) | βœ… | +| `matrix.majorshouse.com` | CNAME β†’ `majorshouse.com.` | `136.56.0.55` (majorlab) | βœ… | + +None of the working subdomains were proxied through Cloudflare (`proxied=false` on all of them); they simply had the right 
IP. The `notes` A record was the only one pointing somewhere wrong β€” most likely a stale value from a prior ISP / IP change that never got cleaned up. + +### βœ… Fix β€” switch `notes` to a Cloudflare-proxied CNAME + +Rather than just correcting the A record (which would silently break again the next time the home IP changes), the fix is a CNAME to the apex with proxy on. That gives two protections in one move: it always tracks the apex (so home IP changes propagate automatically) and it puts the wiki behind Cloudflare's edge (so any future ISP-side weirdness like the original `wiki` SNI filter is also bypassed). + +```bash +# via Cloudflare API (token from ansible-vault: vault_cloudflare_api_token) +PUT /zones/{ZONE_ID}/dns_records/{NOTES_RECORD_ID} +{ + "type": "CNAME", + "name": "notes.majorshouse.com", + "content": "majorshouse.com", + "ttl": 1, + "proxied": true, + "comment": "switched Aβ†’CNAME proxied to bypass stale IP / ISP SNI filter" +} +``` + +Or via the dashboard: + +1. Cloudflare β†’ `majorshouse.com` zone β†’ DNS β†’ Records +2. Edit the `notes` record: Type `CNAME`, Target `majorshouse.com`, Proxy `Proxied` (orange cloud) +3. Save + +External clients now hit Cloudflare edge IPs (`104.21.x.x` / `172.67.x.x`) which TLS-terminate at the edge and tunnel back to majorlab's apex IP. ACME on majorlab keeps working β€” Cloudflare passes the HTTP-01 challenge through on port 80. Caddy's `notes.majorshouse.com {}` block needs no change. + +Verify (response should show `server: cloudflare` and `via: 1.0 Caddy`): + +```bash +curl -sSI https://notes.majorshouse.com/ +``` + +### Why a Cloudflare-proxied CNAME is the durable shape + +- **Apex follows the home IP automatically.** Update the apex A record once when the ISP changes; every subdomain inherits it without per-record fixes. 
+- **TLS handshake is offloaded to CF.** Any ISP-level SNI weirdness (the original `wiki` ban; theoretical future bans) becomes irrelevant β€” external clients SNI=`notes.majorshouse.com` to Cloudflare, which the ISP doesn't filter. +- **Free.** Cloudflare's free tier covers proxy + TLS termination. + +### Audit checklist for any home-hosted `*.majorshouse.com` subdomain + +- [ ] DNS record is a **CNAME** to `majorshouse.com.`, not an A record to a literal home IP. +- [ ] Cloudflare proxy (orange cloud, `proxied=true`) enabled on the record β€” at minimum for any subdomain where TLS reachability matters. +- [ ] Caddy entry on majorlab references the public hostname; `reverse_proxy` stays on the localhost port. +- [ ] HTTPS verified from outside the LAN (phone on cellular is sufficient) within the first hour after the change. +- [ ] If an A record is genuinely required (e.g. it must NOT go through CF), document why in the deploy notes for that service. + +### Related + +- [[majwiki-setup-and-pipeline]] β€” full wiki deploy pipeline; the DNS step there should reference this fix +- [[Network-Overview]] β€” fleet IP table diff --git a/05-troubleshooting/networking/tailscale-status-json-hostname-localhost-ios.md b/05-troubleshooting/networking/tailscale-status-json-hostname-localhost-ios.md new file mode 100644 index 0000000..9e0606e --- /dev/null +++ b/05-troubleshooting/networking/tailscale-status-json-hostname-localhost-ios.md @@ -0,0 +1,116 @@ +--- +title: iOS Tailscale Clients Report HostName="localhost" β€” Breaks /etc/hosts Generators +domain: troubleshooting +category: networking +tags: + - tailscale + - ios + - postfix + - etc-hosts + - jq +status: published +created: 2026-04-29 +updated: 2026-04-29 +--- + +# iOS Tailscale Clients Report HostName="localhost" β€” Breaks /etc/hosts Generators + +## Problem + +A homegrown script that builds an `/etc/hosts` block from `tailscale status --json` silently corrupted the file the moment any iOS device joined the 
tailnet. After the next run, services bound to `localhost` started failing. + +On the affected host (`majordiscord`), Postfix refused to start with: + +``` +postfix: fatal: parameter inet_interfaces: no local interface found for 100.127.114.10 +``` + +`/etc/hosts` looked fine at the top — `127.0.0.1 localhost` was still present — but inside the Tailscale-managed block: + +``` +# TAILSCALE_START +100.84.42.102 tttpod +100.110.197.17 majortoot +100.95.55.40 localhost <-- WRONG (this is an iPhone) +100.84.165.52 majormail +... +100.127.114.10 localhost <-- WRONG (this is an iPad) +# TAILSCALE_END +``` + +When Postfix resolved `localhost` (because `inet_interfaces = localhost` in `main.cf`), the **last matching entry** in `/etc/hosts` won — a Tailscale IP that doesn't exist on this host — and the daemon died on bind. + +## Root Cause + +The script used `.HostName` from the Tailscale JSON: + +```bash +tailscale status --json \ + | jq -r '.Peer[] | "\(.TailscaleIPs[0]) \(.HostName)"' \ + >> "$TEMP_HOSTS" +``` + +iOS Tailscale clients (iPhone, iPad) **always report `HostName: "localhost"`** in the JSON. iOS doesn't expose the real device name to apps the way macOS/Linux/Windows do, so the Tailscale client falls back to the literal string `localhost`. + +Inspect it directly: + +```bash +$ tailscale status --json | jq '.Peer[] | select(.OS == "iOS") | {DNSName, HostName, OS}' +{ + "DNSName": "iphone171.tail7f2d9.ts.net.", + "HostName": "localhost", + "OS": "iOS" +} +{ + "DNSName": "ipad166.tail7f2d9.ts.net.", + "HostName": "localhost", + "OS": "iOS" +} +``` + +Every iOS device contributes a line `<tailscale-ip> localhost` to `/etc/hosts`, hijacking the `localhost` lookup. 
+ +## Fix + +Use `.DNSName` (the unique tailnet DNS name) instead of `.HostName`, and take its first dotted component: + +```bash +tailscale status --json \ + | jq -r '.Peer[] | "\(.TailscaleIPs[0]) \(.DNSName | rtrimstr(".") | split(".")[0])"' \ + >> "$TEMP_HOSTS" +``` + +`DNSName` is always set, always unique, and produces clean labels like `iphone171`, `ipad166`, `majorlab`, etc. + +After patching the script and re-running it: + +```bash +$ bash /root/update_tailscale_hosts.sh +$ systemctl restart postfix +$ systemctl is-active postfix +active +``` + +## Why It's Hard to Spot + +- The corruption only triggers when an iOS device is in the tailnet — so the script "worked" for months. +- `/etc/hosts` files are commonly skimmed top-down. The bogus `localhost` line is buried in the Tailscale block, well below the legitimate `127.0.0.1 localhost` line, and looks superficially like a normal Tailscale entry. +- Postfix's error message names the IP, not `localhost`, so the connection to `/etc/hosts` isn't obvious. +- `getent hosts localhost` shows the *first* match (`127.0.0.1`), not the one Postfix's resolver actually picks for the `inet_interfaces` lookup.
+ +## Verification Checklist + +If you suspect this on any host using a similar generator script: + +```bash +# Lists every IPv4 "localhost" entry; any hit other than the 127.0.0.1 loopback line is a bug +grep -nE '^[0-9]+\..* localhost\s*$' /etc/hosts + +# Look at iOS peers' HostName field +tailscale status --json | jq '.Peer[] | select(.OS == "iOS") | .HostName' +``` + +## Related + +- [[majordiscord]] — affected host (incident logged 2026-04-29) +- [[Network Overview]] — Tailscale fleet topology diff --git a/05-troubleshooting/networking/windows-openssh-wsl-default-shell-breaks-remote-commands.md b/05-troubleshooting/networking/windows-openssh-wsl-default-shell-breaks-remote-commands.md index cd78463..ec8233a 100644 --- a/05-troubleshooting/networking/windows-openssh-wsl-default-shell-breaks-remote-commands.md +++ b/05-troubleshooting/networking/windows-openssh-wsl-default-shell-breaks-remote-commands.md @@ -11,7 +11,7 @@ tags: - powershell status: published created: 2026-04-03 -updated: 2026-04-22T09:20 +updated: 2026-04-30T05:21 --- # Windows OpenSSH: WSL as Default Shell Breaks Remote Commands diff --git a/05-troubleshooting/networking/windows-sshd-stops-after-reboot.md b/05-troubleshooting/networking/windows-sshd-stops-after-reboot.md index 9f299fb..840c2ac 100644 --- a/05-troubleshooting/networking/windows-sshd-stops-after-reboot.md +++ b/05-troubleshooting/networking/windows-sshd-stops-after-reboot.md @@ -10,7 +10,7 @@ tags: - majorrig status: published created: 2026-04-02 -updated: 2026-04-22T09:20 +updated: 2026-04-30T05:21 --- # Windows OpenSSH Server (sshd) Stops After Reboot diff --git a/05-troubleshooting/yt-dlp-fedora-js-challenge.md b/05-troubleshooting/yt-dlp-fedora-js-challenge.md index 9fdd2e4..fd514b7 100644 --- a/05-troubleshooting/yt-dlp-fedora-js-challenge.md +++ b/05-troubleshooting/yt-dlp-fedora-js-challenge.md @@ -10,7 +10,7 @@ tags: - deno status: published created: 2026-04-02 -updated: 2026-04-22T11:33 +updated: 2026-04-30T05:21 --- # yt-dlp YouTube JS Challenge Fix (Fedora)
diff --git a/MajorWiki-Deploy-Status.md b/MajorWiki-Deploy-Status.md index 0f9c2b4..cb61548 100644 --- a/MajorWiki-Deploy-Status.md +++ b/MajorWiki-Deploy-Status.md @@ -2,7 +2,7 @@ title: MajorWiki Deployment Status status: deployed project: MajorTwin -updated: 2026-04-07T10:48 +updated: 2026-04-30T05:30 created: 2026-04-02T16:10 --- @@ -79,6 +79,23 @@ git push Gitea receives the push → fires webhook → majorlab pulls → MkDocs rebuilds → `notes.majorshouse.com` updates automatically. +> [!tip] One-liner wrapper +> On MajorRig, the `~/bin/wiki-commit "msg"` helper runs `git pull --rebase --autostash` → `git add -A` → `git commit` → `git push` in one shot. Sidesteps non-fast-forward rejections from cowork pushes (e.g. MajorAir pushing in parallel) and the empty-credentials issue with HTTPS. + +## 🔒 Pre-Commit Hook (in repo) + +`.githooks/pre-commit` (tracked) blocks any commit that adds or renames a `*.md` article without a corresponding entry in `SUMMARY.md`. Bypass with `git commit --no-verify` if you genuinely need to. + +**Per-clone setup** (one-time, per workstation that uses the repo): + +```bash +cd <path-to-wiki-clone> +git config core.hooksPath .githooks +git config pull.rebase true +``` + +The hooksPath line is required — git doesn't run hooks from a tracked directory by default. The `pull.rebase true` setting makes plain `git pull` always rebase locally, matching the `wiki-commit` wrapper's behavior.
+ ## 📋 Wiki Maintenance Protocol Every time a new article is added, the following **MUST** be updated to maintain index integrity: diff --git a/README.md b/README.md index c6100b9..4c0523a 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ --- created: 2026-04-06T09:52 -updated: 2026-04-29T22:46 +updated: 2026-04-30T05:21 --- # MajorLinux Tech Wiki — Index diff --git a/SUMMARY.md b/SUMMARY.md index 7c20dca..62551e0 100644 --- a/SUMMARY.md +++ b/SUMMARY.md @@ -1,6 +1,6 @@ --- created: 2026-04-02T16:03 -updated: 2026-04-29T22:45 +updated: 2026-04-30T11:24 --- * [Home](index.md) * [Linux & Sysadmin](01-linux/index.md) @@ -43,6 +43,7 @@ updated: 2026-04-29T22:45 * [Fail2ban Custom Jail: Apache 404 Scanner Detection](02-selfhosting/security/fail2ban-apache-404-scanner-jail.md) * [Fail2ban Custom Jail: Apache PHP Webshell Probe Detection](02-selfhosting/security/fail2ban-apache-php-probe-jail.md) * [Fail2ban Custom Jail: WordPress Login Brute Force](02-selfhosting/security/fail2ban-wordpress-login-jail.md) + * [wp-fail2ban Plugin Logpath on Debian/Ubuntu (auth.log not syslog)](02-selfhosting/security/wp-fail2ban-logpath-debian-ubuntu.md) * [SELinux: Fixing Fail2ban grep execmem Denial](02-selfhosting/security/selinux-fail2ban-execmem-fix.md) * [UFW Firewall Management](02-selfhosting/security/ufw-firewall-management.md) * [Firewall Hardening with firewalld on Fedora Fleet](02-selfhosting/security/firewalld-fleet-hardening.md) @@ -77,6 +78,7 @@ updated: 2026-04-29T22:45 * [ISP SNI Filtering with Caddy](05-troubleshooting/isp-sni-filtering-caddy.md) * [Obsidian Vault Recovery — Loading Cache Hang](05-troubleshooting/obsidian-cache-hang-recovery.md) * [Qwen2.5-14B OOM on RTX 3080 Ti (12GB)](05-troubleshooting/gpu-display/qwen-14b-oom-3080ti.md) + * [LoRA adapter — GGUF conversion fails with 'config.json not found'](05-troubleshooting/gpu-display/lora-adapter-gguf-conversion-fails.md) * [yt-dlp YouTube JS Challenge Fix on
Fedora](05-troubleshooting/yt-dlp-fedora-js-challenge.md) * [Gemini CLI Manual Update](05-troubleshooting/gemini-cli-manual-update.md) * [MajorWiki Setup & Publishing Pipeline](05-troubleshooting/majwiki-setup-and-pipeline.md) @@ -90,6 +92,7 @@ updated: 2026-04-29T22:45 * [Ollama Drops Off Tailscale When Mac Sleeps](05-troubleshooting/ollama-macos-sleep-tailscale-disconnect.md) * [Ollama: `ollama run` with Piped Stdin Bypasses Chat Template + SYSTEM Prompt](05-troubleshooting/ollama-chat-template-pipe-stdin-bypass.md) * [rsync over Tailscale: Hung in TCP Teardown After Transfer Completes](05-troubleshooting/networking/rsync-tailscale-teardown-stall.md) + * [iOS Tailscale Clients Report HostName="localhost" — Breaks /etc/hosts Generators](05-troubleshooting/networking/tailscale-status-json-hostname-localhost-ios.md) * [macOS: Repeating Alert Tone from Mirrored iPhone Notification](05-troubleshooting/macos-mirrored-notification-alert-loop.md) * [ClamAV CPU Spike: Safe Scheduling with nice/ionice](05-troubleshooting/security/clamscan-cpu-spike-nice-ionice.md) * [Ansible: Vault Password File Not Found](05-troubleshooting/ansible-vault-password-file-missing.md) diff --git a/index.md b/index.md index cc7c37b..7cf0be2 100644 --- a/index.md +++ b/index.md @@ -1,6 +1,6 @@ --- created: 2026-04-06T09:52 -updated: 2026-04-29T22:45 +updated: 2026-04-30T05:21 --- # MajorLinux Tech Wiki — Index