Skip to content

Commit

Permalink
Merge pull request #7 from thalesmg/shortids
Browse files Browse the repository at this point in the history
sync latest changes to master
  • Loading branch information
thalesmg authored Mar 3, 2022
2 parents 9f2521e + c4f5024 commit 19df674
Show file tree
Hide file tree
Showing 17 changed files with 367 additions and 79 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
venv/
20 changes: 20 additions & 0 deletions fetch-logs.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
---
# Playbook: copy /var/log/syslog from every host in the `emqx` group back to
# the control node.

# Play 1 - applies the define_inventory role on the control node
# (presumably this is what populates the `emqx` group - confirm in the role).
- name: define hosts
  hosts: localhost
  gather_facts: false
  roles:
    - define_inventory

# Play 2 - runs as root on the EMQX hosts.
- name: fetch logs
  hosts: emqx
  become: true
  become_user: root
  tasks:
    - name: syslog
      fetch:
        src: /var/log/syslog
        # With `flat`, the file is stored at exactly this path instead of
        # under the default <dest>/<hostname>/<src path> layout.
        dest: "./tmp/data/{{ emqx_script_result_file }}/{{ inventory_hostname_short }}/syslog"
        flat: true

...
29 changes: 15 additions & 14 deletions files/tuning-emqx.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@ set -eu
# ulimit -n 2097152

# Increase number of incoming connections backlog:
sysctl -w net.core.somaxconn=32768
sysctl -w net.ipv4.tcp_max_syn_backlog=16384
sysctl -w net.core.netdev_max_backlog=16384
# sysctl -w net.core.somaxconn=32768
# sysctl -w net.ipv4.tcp_max_syn_backlog=16384
# sysctl -w net.core.netdev_max_backlog=16384

# local port range
# sysctl -w net.ipv4.ip_local_port_range='1000 65535'
Expand All @@ -28,32 +28,33 @@ set -eu
# sysctl -w net.core.wmem_default=262144
# sysctl -w net.core.rmem_max=16777216
# sysctl -w net.core.wmem_max=16777216
sysctl -w net.core.optmem_max=16777216
# sysctl -w net.core.optmem_max=16777216

# sysctl -w net.ipv4.tcp_mem='16777216 16777216 16777216'
sysctl -w net.ipv4.tcp_rmem='1024 4096 16777216'
sysctl -w net.ipv4.tcp_wmem='1024 4096 16777216'
# sysctl -w net.ipv4.tcp_rmem='1024 4096 16777216'
# sysctl -w net.ipv4.tcp_wmem='1024 4096 16777216'

# TMG FIXME: Outdated or module not loaded; ignore
# ... but trying again after bumping into a limit.
# TCP connection tracking:
# sysctl -w net.nf_conntrack_max=1000000
sysctl -w net.nf_conntrack_max=10000000
# sysctl -w net.netfilter.nf_conntrack_max=1000000
# sysctl -w net.netfilter.nf_conntrack_tcp_timeout_time_wait=30

# TIME-WAIT Bucket Pool, Recycling and Reuse:
sysctl -w net.ipv4.tcp_max_tw_buckets=1048576
# sysctl -w net.ipv4.tcp_max_tw_buckets=1048576

# Enabling the following option is not recommended: it could cause connection resets under NAT
# sysctl -w net.ipv4.tcp_tw_recycle=1
# sysctl -w net.ipv4.tcp_tw_reuse=1

# Timeout for FIN-WAIT-2 Sockets:
sysctl -w net.ipv4.tcp_fin_timeout=15
# sysctl -w net.ipv4.tcp_fin_timeout=15

# William's config
sysctl -w net.core.rmem_default=262144000
sysctl -w net.core.wmem_default=262144000
sysctl -w net.core.rmem_max=262144000
sysctl -w net.core.wmem_max=262144000
sysctl -w net.ipv4.tcp_mem="378150000 504200000 756300000"
# sysctl -w net.core.rmem_default=262144000
# sysctl -w net.core.wmem_default=262144000
# sysctl -w net.core.rmem_max=262144000
# sysctl -w net.core.wmem_max=262144000
# sysctl -w net.ipv4.tcp_mem="378150000 504200000 756300000"
}
31 changes: 31 additions & 0 deletions full-restart.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
---
# Playbook: stop EMQX everywhere, clean its logs, then start it again.
# Plays execute in file order, so the `cores` group is started before the
# `replicants` group.

# Play 1 - applies the define_inventory role on the control node
# (presumably this is what populates the host groups used below - confirm).
- name: define hosts
  hosts: localhost
  gather_facts: false
  roles:
    - define_inventory

# Play 2 - on every EMQX host: stop the broker, then clean up.
- name: stop emqx and clean logs
  hosts: emqx
  become: true
  become_user: root
  roles:
    - stop_emqx
    - clean_logs

# Play 3 - start the core nodes.
- name: start cores
  hosts: cores
  become: true
  become_user: root
  roles:
    - start_emqx

# Play 4 - start the replicant nodes.
- name: start replicants
  hosts: replicants
  become: true
  become_user: root
  roles:
    - start_emqx

...
31 changes: 22 additions & 9 deletions make-emqx-inv
Original file line number Diff line number Diff line change
@@ -1,12 +1,25 @@
#!/usr/bin/env bb
;; -*- mode: clojure; -*-

(let [[bastion-ip num-emqx] *command-line-args*
emqxs (map #(str "emqx-" % ".int.thalesmg") (range (Integer/parseInt num-emqx)))]
(doseq [emqx emqxs]
(println (str "ssh"
" -o StrictHostKeyChecking=no"
" -J ec2-user@"
bastion-ip
" ubuntu@"
emqx))))
;; Usage: make-emqx-inv BASTION-IP NUM-EMQX [NUM-CORES]
;;
;; Builds one ssh command line per EMQX node (jumping through the bastion),
;; treats the first NUM-CORES nodes as cores and the remainder as replicants,
;; writes par-hosts.txt / cores.txt / replicants.txt into the current
;; directory and prints the full list to stdout.
(let [[bastion-ip num-emqx num-cores] *command-line-args*
      total (Integer/parseInt num-emqx)
      ;; NUM-CORES is optional so the previous two-argument invocation keeps
      ;; working: when it is omitted every node is treated as a core.
      core-count (if num-cores (Integer/parseInt num-cores) total)
      emqxs (map #(str "emqx-" % ".int.thalesmg") (range total))
      make-line (fn [emqx]
                  (str "ssh"
                       " -o StrictHostKeyChecking=no"
                       " -J ec2-user@"
                       bastion-ip
                       " ubuntu@"
                       emqx))
      core-lines (map make-line (take core-count emqxs))
      replicant-lines (map make-line (drop core-count emqxs))
      cores (str/join "\n" core-lines)
      replicants (str/join "\n" replicant-lines)
      ;; Join the individual lines rather than the two pre-joined strings, so
      ;; an empty group does not leave a blank host line in par-hosts.txt.
      all-hosts (str/join "\n" (concat core-lines replicant-lines))]
  (spit "par-hosts.txt" all-hosts)
  (spit "cores.txt" cores)
  (spit "replicants.txt" replicants)
  (println all-hosts))
12 changes: 12 additions & 0 deletions roles/clean_logs/tasks/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
---
# Role tasks: remove the mnesia directory contents, the EMQX logs, the syslog
# contents and any mem-ets dump files on the target host.

- name: clear mnesia dir
  shell: "rm -rf /var/lib/emqx/mnesia/*"

- name: clear emqx logs
  shell: "rm -rf /var/log/emqx/*"

# Truncate to zero bytes; `echo >` would leave a lone newline in the file.
- name: clear syslog
  shell: ": > /var/log/syslog"

# `-f` makes "no dump files present" a clean success instead of a failed
# (and then ignored) task. ignore_errors is kept so this stays best-effort.
- name: clear mem ets dumps
  shell: "rm -f /tmp/*_mem-ets-dump.txt"
  ignore_errors: true
54 changes: 54 additions & 0 deletions roles/collect_logs/tasks/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
---
# Role tasks: gather diagnostics from an EMQX host and fetch them to the
# control node under ./tmp/data/<emqx_script_result_file>/.
# Collected, each only when available: node dump, Erlang crash dump, syslog,
# and a tarball of the mem-ets dump files.

# --- node dump --------------------------------------------------------------

# A failing node_dump is tolerated; the follow-up tasks check `.failed`.
- name: produce node_dump
  shell: /usr/lib/emqx/bin/node_dump
  register: node_dump_invoke
  ignore_errors: true

# Pull the archive path out of the tool's "Created a node dump <path>" line.
- name: find node dump location
  vars:
    regexp: "Created a node dump (.+)"
  set_fact:
    node_dump_path: "{{ node_dump_invoke.stdout | regex_search(regexp, '\\1') }}"
  when: not node_dump_invoke.failed

# regex_search with a group reference yields a list, hence the [0] below.
- name: fetch node dump
  fetch:
    src: "{{ node_dump_path[0] }}"
    dest: "./tmp/data/{{ emqx_script_result_file }}/{{ inventory_hostname_short }}-node_dump.tar.gz"
    flat: true
  when: not node_dump_invoke.failed and node_dump_path is defined and node_dump_path

# --- crash dump and syslog --------------------------------------------------

- name: crashdump?
  stat:
    path: /var/log/emqx/erl_crash.dump
  register: crashdump_stat

- name: syslog?
  stat:
    path: /var/log/syslog
  register: syslog_stat

- name: fetch crashdump
  fetch:
    src: "{{ crashdump_stat.stat.path }}"
    dest: "./tmp/data/{{ emqx_script_result_file }}/{{ inventory_hostname_short }}-crashdump"
    flat: true
  when: crashdump_stat.stat.exists

- name: fetch syslog
  fetch:
    src: "{{ syslog_stat.stat.path }}"
    dest: "./tmp/data/{{ emqx_script_result_file }}/{{ inventory_hostname_short }}-syslog"
    flat: true
  when: syslog_stat.stat.exists

# --- mem-ets dumps ----------------------------------------------------------

# `|| true` keeps the task green when nothing matches; stdout is then empty
# and the two tasks below are skipped.
- name: find mem ets dumps
  shell: "ls /tmp/*_mem-ets-dump.txt 2>/dev/null || true"
  register: mem_ets_dumps

# Bundle all dumps into a single bzip2 tarball in /tmp.
- name: tar mem ets dumps
  shell: |
    cd /tmp
    tar -cjf {{ emqx_script_result_file }}.mem-ets-dump.tar.bz2 *_mem-ets-dump.txt
  when: mem_ets_dumps.stdout_lines

- name: fetch mem ets dumps
  fetch:
    src: "/tmp/{{ emqx_script_result_file }}.mem-ets-dump.tar.bz2"
    dest: "./tmp/data/{{ emqx_script_result_file }}/{{ inventory_hostname_short }}-mem-ets-dump.tar.bz2"
    flat: true
  when: mem_ets_dumps.stdout_lines

...
19 changes: 19 additions & 0 deletions roles/move_results/tasks/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
---
# Role tasks: move the collected results directory to
# `emqx_script_result_dest`, when that variable is set and results exist.

- name: results?
  stat:
    path: "./tmp/data/{{ emqx_script_result_file }}"
  register: results_stat

# Copy-then-delete. The whole block is skipped unless the results directory
# exists and a non-empty destination was provided.
- when: >-
    results_stat.stat.exists and results_stat.stat.isdir
    and emqx_script_result_dest is defined and emqx_script_result_dest
  block:
    - name: copy to destination
      copy:
        # The source is read from the host the task runs on, not uploaded
        # from the controller.
        remote_src: true
        src: "{{ results_stat.stat.path }}"
        dest: "{{ emqx_script_result_dest }}/"

    - name: remove original
      file:
        path: "{{ results_stat.stat.path }}"
        state: absent

...
2 changes: 2 additions & 0 deletions roles/start_emqx/tasks/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
---
# Role tasks: launch the broker through the `emqx` command-line tool.

- name: start emqx
  shell:
    cmd: emqx start
19 changes: 19 additions & 0 deletions roles/stop_emqx/tasks/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
---
# Role tasks: stop EMQX - first the systemd unit, then via the `emqx` CLI,
# and as a last resort by killing the Erlang VM process.

- name: stop systemd emqx
  systemd:
    name: emqx
    state: stopped
    daemon_reload: true

- name: stop emqx
  block:
    # Only a "badrpc,timeout" on stderr counts as a failure (and triggers a
    # retry); any other outcome of `emqx stop` is accepted.
    - name: stop emqx
      shell: "env EMQX_WAIT_FOR_STOP=20 emqx stop"
      register: stop_result
      until: '"badrpc,timeout" not in stop_result.stderr'
      retries: 1
      delay: 3
      failed_when: '"badrpc,timeout" in stop_result.stderr'
  rescue:
    # Runs only if the graceful stop above still failed.
    - name: REALLY stop emqx
      shell: pkill -9 beam.smp
      ignore_errors: true
51 changes: 33 additions & 18 deletions run-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
- emqx_bastion_server
- emqx_test_script_file
- emqtt_bench_interval
- emqtt_bench_pub_interval
- emqtt_bench_number_of_connections
- emqtt_bench_session_expiry_interval
- script_timezone
Expand Down Expand Up @@ -61,8 +62,8 @@
timeout: "{{ script_timeout_s | default('300') }}"
num_procs: "{{ emqx_test_procs | default(1) | int }}"
start_n: "{{ (loadgen_num * (emqx_test_procs | default(1) | int)) | int }}"
- debug:
var: test_runner
# - debug:
# var: test_runner
- name: give the scripts some time...
pause:
seconds: "{{ script_timeout_s | default('300') }}"
Expand All @@ -73,23 +74,37 @@
shell: "ps -ef | grep bench | grep -v grep | awk '{print $2}' | xargs kill -9"
# shell: "pkill beam"
ignore_errors: yes
- name: fetch output
tags: [collect]
fetch:
src: "/tmp/{{ emqx_script_result_file }}"
dest: "./tmp/data/{{ emqx_script_result_file }}/{{ inventory_hostname_short }}"
flat: yes
# - name: fetch output
# tags: [collect]
# fetch:
# src: "/tmp/{{ emqx_script_result_file }}"
# dest: "./tmp/data/{{ emqx_script_result_file }}/{{ inventory_hostname_short }}"
# flat: yes

- name: collect logs
hosts: emqx
become: yes
become_user: root
tags: [collect_logs]
roles:
- collect_logs

- name: test
- name: move results
hosts: localhost
tags: [collect]
tasks:
- name: compress
archive:
path:
- './tmp/data/{{ emqx_script_result_file }}/*'
dest: "./tmp/data/{{ emqx_script_result_file }}.tar.bz2"
remove: yes
format: bz2
tags: [collect_logs]
roles:
- move_results

# - name: test
# hosts: localhost
# tags: [collect]
# tasks:
# - name: compress
# archive:
# path:
# - './tmp/data/{{ emqx_script_result_file }}/*'
# dest: "./tmp/data/{{ emqx_script_result_file }}.tar.bz2"
# remove: yes
# format: bz2

...
20 changes: 15 additions & 5 deletions scripts/cc.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,11 +93,17 @@ def fetch_mem_ets_dump(c : Connection, outdir : Path, prefix : str):
for med in c.run("ls /tmp/", hide=True).stdout.splitlines()
if med.endswith("_mem-ets-dump.txt")
]
for dump in dumps:
infile = f"/tmp/{dump}"
outfile = f"{prefix}.{c.host}.{dump}"
if dumps:
outfile = f"{prefix}.{c.host}.mem-ets-dump.tar.bz2"
tar_dump = f"/tmp/{outfile}"
c.run(f"cd /tmp && tar -cjf {tar_dump} *_mem-ets-dump.txt")
outfilepath = outdir.joinpath(outfile)
c.get(infile, local=str(outfilepath))
c.get(tar_dump, local=str(outfilepath))
# for dump in dumps:
# infile = f"/tmp/{dump}"
# outfile = f"{prefix}.{c.host}.{dump}"
# outfilepath = outdir.joinpath(outfile)
# c.get(infile, local=str(outfilepath))


def fetch_logs(args):
Expand All @@ -108,8 +114,12 @@ def fetch_logs(args):
prefix = args.prefix

for c in inventory_emqx(num_emqx, bastion_ip, cluster_name):
print(f"dumping {c}")
fetch_syslog(c, outdir, prefix)
fetch_node_dump(c, outdir, prefix)
try:
fetch_node_dump(c, outdir, prefix)
except Exception as e:
print(e)
fetch_crashdump(c, outdir, prefix)
fetch_mem_ets_dump(c, outdir, prefix)

Expand Down
Loading

0 comments on commit 19df674

Please sign in to comment.