监控策略

Nightingale因为内置了服务树这种机器分组机制,和Open-Falcon相比,告警灵活性是一个质的提升

Nightingale的告警策略与Open-Falcon的配置有很大区别:首先,取消了策略模板的机制,每一条策略都可以单独配置告警接收人;其次,策略可以直接绑定到服务树节点上,节点下的所有机器都会继承生效;另外,还增加了一些字段,下面逐个字段解释:

  • 策略名称:描述这条策略的作用,比如“CPU利用率超过85%”
  • 生效节点:关联的服务树节点,节点下所有机器都会应用这条策略
  • 排除节点:生效节点下面的部分子节点可能较为特殊需要排除,可以用此配置解决
  • 报警级别:分P1、P2、P3三级,P1最严重,报警之后事件通过所有报警通道推送;P3最不严重,只通过部分通道推送
  • 统计周期:判断报警的时候使用最近多长时间以内的数据
  • 触发条件:支持“与”条件,即两个条件同时满足才报警
  • Tag过滤:可以配置策略只对监控指标的部分tag生效,或者排除部分tag,比如disk.io.util只监控sda
  • 执行动作:配置报警收敛策略和报警接收人,也支持配置回调,与自动化逻辑打通
  • 留观时长:告警恢复后持续观察多少秒,称为留观时长,未再触发阈值才发送恢复通知
  • 静默恢复:开启后只发送告警消息,不发送恢复通知;默认不开启,即默认会发送恢复通知
  • 生效时间:即策略生效时间,默认7*24生效,可以配置只生效部分时间段

策略配置页面支持导入,这里整理了一些常见策略,可以一键导入,然后批量修改一下报警接收人就可以用起来了 :-)

[
  {
    "name": "timewait状态tcp连接超过2万",
    "category": 1,
    "alert_dur": 60,
    "recovery_dur": 0,
    "recovery_notify": 1,
    "enable_stime": "00:00",
    "enable_etime": "23:59",
    "priority": 3,
    "runbook": "",
    "nids": null,
    "exprs": [
      {
        "eopt": ">",
        "func": "all",
        "metric": "net.sockets.tcp.timewait",
        "params": [],
        "threshold": 20000
      }
    ],
    "tags": [],
    "enable_days_of_week": [
      0,
      1,
      2,
      3,
      4,
      5,
      6
    ],
    "converge": [
      36000,
      1
    ],
    "endpoints": null,
    "judge_instance": "",
    "work_groups": null
  },
  {
    "name": "内存利用率大于75%",
    "category": 1,
    "alert_dur": 60,
    "recovery_dur": 0,
    "recovery_notify": 1,
    "enable_stime": "00:00",
    "enable_etime": "23:59",
    "priority": 2,
    "runbook": "",
    "nids": null,
    "exprs": [
      {
        "eopt": ">",
        "func": "all",
        "metric": "mem.bytes.used.percent",
        "params": [],
        "threshold": 75
      }
    ],
    "tags": [],
    "enable_days_of_week": [
      0,
      1,
      2,
      3,
      4,
      5,
      6
    ],
    "converge": [
      36000,
      1
    ],
    "endpoints": null,
    "judge_instance": "",
    "work_groups": null
  },
  {
    "name": "机器loadavg大于16",
    "category": 1,
    "alert_dur": 60,
    "recovery_dur": 0,
    "recovery_notify": 1,
    "enable_stime": "00:00",
    "enable_etime": "23:59",
    "priority": 2,
    "runbook": "",
    "nids": null,
    "exprs": [
      {
        "eopt": ">",
        "func": "all",
        "metric": "cpu.loadavg.1",
        "params": [],
        "threshold": 16
      }
    ],
    "tags": [],
    "enable_days_of_week": [
      0,
      1,
      2,
      3,
      4,
      5,
      6
    ],
    "converge": [
      36000,
      1
    ],
    "endpoints": null,
    "judge_instance": "",
    "work_groups": null
  },
  {
    "name": "某磁盘无法正常读写",
    "category": 1,
    "alert_dur": 60,
    "recovery_dur": 0,
    "recovery_notify": 1,
    "enable_stime": "00:00",
    "enable_etime": "23:59",
    "priority": 1,
    "runbook": "",
    "nids": null,
    "exprs": [
      {
        "eopt": ">",
        "func": "all",
        "metric": "disk.rw.error",
        "params": [],
        "threshold": 0
      }
    ],
    "tags": [],
    "enable_days_of_week": [
      0,
      1,
      2,
      3,
      4,
      5,
      6
    ],
    "converge": [
      36000,
      1
    ],
    "endpoints": null,
    "judge_instance": "",
    "work_groups": null
  },
  {
    "name": "监控agent失联",
    "category": 1,
    "alert_dur": 60,
    "recovery_dur": 0,
    "recovery_notify": 1,
    "enable_stime": "00:00",
    "enable_etime": "23:59",
    "priority": 1,
    "runbook": "",
    "nids": null,
    "exprs": [
      {
        "eopt": "=",
        "func": "nodata",
        "metric": "proc.agent.alive",
        "params": [],
        "threshold": 0
      }
    ],
    "tags": [],
    "enable_days_of_week": [
      0,
      1,
      2,
      3,
      4,
      5,
      6
    ],
    "converge": [
      36000,
      1
    ],
    "endpoints": null,
    "judge_instance": "",
    "work_groups": null
  },
  {
    "name": "磁盘利用率达到85%",
    "category": 1,
    "alert_dur": 60,
    "recovery_dur": 0,
    "recovery_notify": 1,
    "enable_stime": "00:00",
    "enable_etime": "23:59",
    "priority": 3,
    "runbook": "",
    "nids": null,
    "exprs": [
      {
        "eopt": ">",
        "func": "all",
        "metric": "disk.bytes.used.percent",
        "params": [],
        "threshold": 85
      }
    ],
    "tags": [],
    "enable_days_of_week": [
      0,
      1,
      2,
      3,
      4,
      5,
      6
    ],
    "converge": [
      36000,
      1
    ],
    "endpoints": null,
    "judge_instance": "",
    "work_groups": null
  },
  {
    "name": "磁盘利用率达到88%",
    "category": 1,
    "alert_dur": 60,
    "recovery_dur": 0,
    "recovery_notify": 1,
    "enable_stime": "00:00",
    "enable_etime": "23:59",
    "priority": 2,
    "runbook": "",
    "nids": null,
    "exprs": [
      {
        "eopt": ">",
        "func": "all",
        "metric": "disk.bytes.used.percent",
        "params": [],
        "threshold": 88
      }
    ],
    "tags": [],
    "enable_days_of_week": [
      0,
      1,
      2,
      3,
      4,
      5,
      6
    ],
    "converge": [
      36000,
      1
    ],
    "endpoints": null,
    "judge_instance": "",
    "work_groups": null
  },
  {
    "name": "磁盘利用率达到92%",
    "category": 1,
    "alert_dur": 60,
    "recovery_dur": 0,
    "recovery_notify": 1,
    "enable_stime": "00:00",
    "enable_etime": "23:59",
    "priority": 1,
    "runbook": "",
    "nids": null,
    "exprs": [
      {
        "eopt": ">",
        "func": "all",
        "metric": "disk.bytes.used.percent",
        "params": [],
        "threshold": 92
      }
    ],
    "tags": [],
    "enable_days_of_week": [
      0,
      1,
      2,
      3,
      4,
      5,
      6
    ],
    "converge": [
      36000,
      1
    ],
    "endpoints": null,
    "judge_instance": "",
    "work_groups": null
  },
  {
    "name": "端口挂了",
    "category": 1,
    "alert_dur": 60,
    "recovery_dur": 0,
    "recovery_notify": 1,
    "enable_stime": "00:00",
    "enable_etime": "23:59",
    "priority": 2,
    "runbook": "",
    "nids": null,
    "exprs": [
      {
        "eopt": "!=",
        "func": "all",
        "metric": "proc.port.listen",
        "params": [],
        "threshold": 1
      }
    ],
    "tags": [],
    "enable_days_of_week": [
      0,
      1,
      2,
      3,
      4,
      5,
      6
    ],
    "converge": [
      36000,
      1
    ],
    "endpoints": null,
    "judge_instance": "",
    "work_groups": null
  },
  {
    "name": "网卡入方向丢包",
    "category": 1,
    "alert_dur": 60,
    "recovery_dur": 0,
    "recovery_notify": 1,
    "enable_stime": "00:00",
    "enable_etime": "23:59",
    "priority": 2,
    "runbook": "",
    "nids": null,
    "exprs": [
      {
        "eopt": ">",
        "func": "all",
        "metric": "net.in.dropped",
        "params": [],
        "threshold": 3
      }
    ],
    "tags": [],
    "enable_days_of_week": [
      0,
      1,
      2,
      3,
      4,
      5,
      6
    ],
    "converge": [
      36000,
      1
    ],
    "endpoints": null,
    "judge_instance": "",
    "work_groups": null
  },
  {
    "name": "网卡入方向错包",
    "category": 1,
    "alert_dur": 60,
    "recovery_dur": 0,
    "recovery_notify": 1,
    "enable_stime": "00:00",
    "enable_etime": "23:59",
    "priority": 2,
    "runbook": "",
    "nids": null,
    "exprs": [
      {
        "eopt": ">",
        "func": "all",
        "metric": "net.in.errs",
        "params": [],
        "threshold": 3
      }
    ],
    "tags": [],
    "enable_days_of_week": [
      0,
      1,
      2,
      3,
      4,
      5,
      6
    ],
    "converge": [
      36000,
      1
    ],
    "endpoints": null,
    "judge_instance": "",
    "work_groups": null
  },
  {
    "name": "网卡出方向丢包",
    "category": 1,
    "alert_dur": 60,
    "recovery_dur": 0,
    "recovery_notify": 1,
    "enable_stime": "00:00",
    "enable_etime": "23:59",
    "priority": 2,
    "runbook": "",
    "nids": null,
    "exprs": [
      {
        "eopt": ">",
        "func": "all",
        "metric": "net.out.dropped",
        "params": [],
        "threshold": 3
      }
    ],
    "tags": [],
    "enable_days_of_week": [
      0,
      1,
      2,
      3,
      4,
      5,
      6
    ],
    "converge": [
      36000,
      1
    ],
    "endpoints": null,
    "judge_instance": "",
    "work_groups": null
  },
  {
    "name": "网卡出方向错包",
    "category": 1,
    "alert_dur": 60,
    "recovery_dur": 0,
    "recovery_notify": 1,
    "enable_stime": "00:00",
    "enable_etime": "23:59",
    "priority": 2,
    "runbook": "",
    "nids": null,
    "exprs": [
      {
        "eopt": ">",
        "func": "all",
        "metric": "net.out.errs",
        "params": [],
        "threshold": 3
      }
    ],
    "tags": [],
    "enable_days_of_week": [
      0,
      1,
      2,
      3,
      4,
      5,
      6
    ],
    "converge": [
      36000,
      1
    ],
    "endpoints": null,
    "judge_instance": "",
    "work_groups": null
  },
  {
    "name": "进程总数超过3000",
    "category": 1,
    "alert_dur": 60,
    "recovery_dur": 0,
    "recovery_notify": 1,
    "enable_stime": "00:00",
    "enable_etime": "23:59",
    "priority": 1,
    "runbook": "",
    "nids": null,
    "exprs": [
      {
        "eopt": ">",
        "func": "all",
        "metric": "sys.ps.process.total",
        "params": [],
        "threshold": 3000
      }
    ],
    "tags": [],
    "enable_days_of_week": [
      0,
      1,
      2,
      3,
      4,
      5,
      6
    ],
    "converge": [
      36000,
      1
    ],
    "endpoints": null,
    "judge_instance": "",
    "work_groups": null
  },
  {
    "name": "进程挂了",
    "category": 1,
    "alert_dur": 60,
    "recovery_dur": 0,
    "recovery_notify": 1,
    "enable_stime": "00:00",
    "enable_etime": "23:59",
    "priority": 2,
    "runbook": "",
    "nids": null,
    "exprs": [
      {
        "eopt": "<",
        "func": "all",
        "metric": "proc.num",
        "params": [],
        "threshold": 1
      }
    ],
    "tags": [],
    "enable_days_of_week": [
      0,
      1,
      2,
      3,
      4,
      5,
      6
    ],
    "converge": [
      36000,
      1
    ],
    "endpoints": null,
    "judge_instance": "",
    "work_groups": null
  }
]

最后修改 2021-02-06: v2 (bb04a83)