06k8s-install-centos.sh 104 KB


  1. #!/usr/bin/env bash
  2. # 参考 https://raw.githubusercontent.com/lework/kainstall/v1.4.9/kainstall-centos.sh
  3. [[ -n $DEBUG ]] && set -x
  4. set -o errtrace # Make sure any error trap is inherited
  5. set -o nounset # Disallow expansion of unset variables
  6. set -o pipefail # Use last non-zero exit code in a pipeline
  7. # 版本
  8. KUBE_VERSION="${KUBE_VERSION:-latest}"
  9. FLANNEL_VERSION="${FLANNEL_VERSION:-0.17.0}"
  10. CALICO_VERSION="${CALICO_VERSION:-3.22.1}"
  11. CILIUM_VERSION="${CILIUM_VERSION:-1.9.13}"
  12. HELM_VERSION="${HELM_VERSION:-3.10.1}"
  13. INGRESS_NGINX="${INGRESS_NGINX:-4.2.5}"
  14. RANCHER_VERSION="${RANCHER_VERSION:-2.6.8}"
  15. #METRICS_SERVER_VERSION="${METRICS_SERVER_VERSION:-0.6.1}"
  16. #KUBE_PROMETHEUS_VERSION="${KUBE_PROMETHEUS_VERSION:-0.10.0}"
  17. #ELASTICSEARCH_VERSION="${ELASTICSEARCH_VERSION:-8.1.0}"
  18. #ROOK_VERSION="${ROOK_VERSION:-1.8.7}"
  19. #LONGHORN_VERSION="${LONGHORN_VERSION:-1.2.4}"
  20. # 集群配置
  21. KUBE_DNSDOMAIN="${KUBE_DNSDOMAIN:-cluster.local}"
  22. KUBE_APISERVER="${KUBE_APISERVER:-apiserver.$KUBE_DNSDOMAIN}"
  23. KUBE_POD_SUBNET="${KUBE_POD_SUBNET:-10.244.0.0/16}"
  24. KUBE_SERVICE_SUBNET="${KUBE_SERVICE_SUBNET:-10.96.0.0/16}"
  25. KUBE_IMAGE_REPO="${KUBE_IMAGE_REPO:-registry.cn-hangzhou.aliyuncs.com/kainstall}"
  26. KUBE_NETWORK="${KUBE_NETWORK:-flannel}"
  27. KUBE_INGRESS="${KUBE_INGRESS:-nginx}"
  28. KUBE_MONITOR="${KUBE_MONITOR:-prometheus}"
  29. KUBE_STORAGE="${KUBE_STORAGE:-rook}"
  30. KUBE_LOG="${KUBE_LOG:-elasticsearch}"
  31. KUBE_UI="${KUBE_UI:-dashboard}"
  32. KUBE_ADDON="${KUBE_ADDON:-metrics-server}"
  33. KUBE_FLANNEL_TYPE="${KUBE_FLANNEL_TYPE:-vxlan}"
  34. KUBE_CRI="${KUBE_CRI:-docker}"
  35. KUBE_CRI_VERSION="${KUBE_CRI_VERSION:-latest}"
  36. KUBE_CRI_ENDPOINT="${KUBE_CRI_ENDPOINT:-/var/run/dockershim.sock}"
  37. # 定义的master和worker节点地址,以逗号分隔
  38. MASTER_NODES="${MASTER_NODES:-}"
  39. WORKER_NODES="${WORKER_NODES:-}"
  40. # 定义在哪个节点上进行设置
  41. MGMT_NODE="${MGMT_NODE:-127.0.0.1}"
  42. # 节点的连接信息
  43. SSH_USER="${SSH_USER:-root}"
  44. SSH_PASSWORD="${SSH_PASSWORD:-}"
  45. SSH_PRIVATE_KEY="${SSH_PRIVATE_KEY:-}"
  46. SSH_PORT="${SSH_PORT:-22}"
  47. SUDO_USER="${SUDO_USER:-root}"
  48. # 节点设置
  49. HOSTNAME_PREFIX="${HOSTNAME_PREFIX:-k8s}"
  50. # nginx的端口配置
  51. NGINX_HTTP_PORT="${NGINX_HTTP_PORT:-80}"
  52. # 脚本设置
  53. TMP_DIR="$(rm -rf /tmp/kainstall* && mktemp -d -t kainstall.XXXXXXXXXX)"
  54. LOG_FILE="${TMP_DIR}/kainstall.log"
  55. SSH_OPTIONS="-o ConnectTimeout=600 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null"
  56. ERROR_INFO="\n\033[31mERROR Summary: \033[0m\n "
  57. ACCESS_INFO="\n\033[32mACCESS Summary: \033[0m\n "
  58. COMMAND_OUTPUT=""
  59. SCRIPT_PARAMETER="$*"
  60. OFFLINE_DIR="/tmp/kainstall-offline-file/"
  61. OFFLINE_FILE=""
  62. OS_SUPPORT="centos7 centos8"
  63. GITHUB_PROXY="${GITHUB_PROXY:-https://ghproxy.com/}"
  64. GCR_PROXY="${GCR_PROXY:-k8sgcr.lework.workers.dev}"
  65. SKIP_UPGRADE_PLAN=${SKIP_UPGRADE_PLAN:-false}
  66. SKIP_SET_OS_REPO=${SKIP_SET_OS_REPO:-false}
  67. trap trap::info 1 2 3 15 EXIT
  68. ######################################################################################################
  69. # 通用函数
  70. ######################################################################################################
  71. # 信号处理
  72. function trap::info() {
  73. [[ ${#ERROR_INFO} -gt 37 ]] && echo -e "$ERROR_INFO"
  74. [[ ${#ACCESS_INFO} -gt 38 ]] && echo -e "$ACCESS_INFO"
  75. [ -f "$LOG_FILE" ] && echo -e "\n\n See detailed log >>> $LOG_FILE \n\n"
  76. trap '' EXIT
  77. exit
  78. }
  79. # 错误日志
  80. function log::error() {
  81. local item; item="[$(date +'%Y-%m-%dT%H:%M:%S.%N%z')]: \033[31mERROR: \033[0m$*"
  82. ERROR_INFO="${ERROR_INFO}${item}\n "
  83. echo -e "${item}" | tee -a "$LOG_FILE"
  84. }
  85. # 基础日志
  86. function log::info() {
  87. printf "[%s]: \033[32mINFO: \033[0m%s\n" "$(date +'%Y-%m-%dT%H:%M:%S.%N%z')" "$*" | tee -a "$LOG_FILE"
  88. }
  89. # 警告日志
  90. function log::warning() {
  91. printf "[%s]: \033[33mWARNING: \033[0m%s\n" "$(date +'%Y-%m-%dT%H:%M:%S.%N%z')" "$*" | tee -a "$LOG_FILE"
  92. }
  93. # 访问信息
  94. function log::access() {
  95. ACCESS_INFO="${ACCESS_INFO}$*\n "
  96. printf "[%s]: \033[32mINFO: \033[0m%s\n" "$(date +'%Y-%m-%dT%H:%M:%S.%N%z')" "$*" | tee -a "$LOG_FILE"
  97. }
  98. # 执行日志
  99. function log::exec() {
  100. printf "[%s]: \033[34mEXEC: \033[0m%s\n" "$(date +'%Y-%m-%dT%H:%M:%S.%N%z')" "$*" >> "$LOG_FILE"
  101. }
  102. # 版本号转数字
  103. function utils::version_to_number() {
  104. echo "$@" | awk -F. '{ printf("%d%03d%03d%03d\n", $1,$2,$3,$4); }';
  105. }
  106. # 重试
  107. function utils::retry() {
  108. local retries=$1
  109. shift
  110. local count=0
  111. until eval "$*"; do
  112. exit=$?
  113. wait=$((2 ** count))
  114. count=$((count + 1))
  115. if [ "$count" -lt "$retries" ]; then
  116. echo "Retry $count/$retries exited $exit, retrying in $wait seconds..."
  117. sleep $wait
  118. else
  119. echo "Retry $count/$retries exited $exit, no more retries left."
  120. return $exit
  121. fi
  122. done
  123. return 0
  124. }
  125. # 转义引号
  126. function utils::quote() {
  127. # shellcheck disable=SC2046
  128. if [ $(echo "$*" | tr -d "\n" | wc -c) -eq 0 ]; then
  129. echo "''"
  130. elif [ $(echo "$*" | tr -d "[a-z][A-Z][0-9]:,.=~_/\n-" | wc -c) -gt 0 ]; then
  131. printf "%s" "$*" | sed -e "1h;2,\$H;\$!d;g" -e "s/'/\'\"\'\"\'/g" | sed -e "1h;2,\$H;\$!d;g" -e "s/^/'/g" -e "s/$/'/g"
  132. else
  133. echo "$*"
  134. fi
  135. }
  136. # 下载文件
  137. function utils::download_file() {
  138. local url="$1"
  139. local dest="$2"
  140. local unzip_tag="${3:-1}"
  141. local dest_dirname; dest_dirname=$(dirname "$dest")
  142. local filename; filename=$(basename "$dest")
  143. log::info "[download]" "${filename}"
  144. command::exec "${MGMT_NODE}" "
  145. set -e
  146. if [ ! -f \"${dest}\" ]; then
  147. [ ! -d \"${dest_dirname}\" ] && mkdir -pv \"${dest_dirname}\"
  148. wget --timeout=10 --waitretry=3 --tries=5 --retry-connrefused --no-check-certificate \"${url}\" -O \"${dest}\"
  149. if [[ \"${unzip_tag}\" == \"unzip\" ]]; then
  150. command -v unzip 2>/dev/null || yum install -y unzip
  151. unzip -o \"${dest}\" -d \"${dest_dirname}\"
  152. fi
  153. else
  154. echo \"${dest} is exists!\"
  155. fi
  156. "
  157. local status="$?"
  158. check::exit_code "$status" "download" "${filename}" "exit"
  159. return "$status"
  160. }
  161. # 判断是否在数组中存在元素
  162. function utils::is_element_in_array() {
  163. local -r element="${1}"
  164. local -r array=("${@:2}")
  165. local walker=''
  166. for walker in "${array[@]}"
  167. do
  168. [[ "${walker}" = "${element}" ]] && return 0
  169. done
  170. return 1
  171. }
  172. # 执行命令
  173. function command::exec() {
  174. local host=${1:-}
  175. shift
  176. local command="$*"
  177. if [[ "${SUDO_TAG:-}" == "1" ]]; then
  178. sudo_options="sudo -H -n -u ${SUDO_USER}"
  179. if [[ "${SUDO_PASSWORD:-}" != "" ]]; then
  180. sudo_options="${sudo_options// -n/} -p \"\" -S <<< \"${SUDO_PASSWORD}\""
  181. fi
  182. command="$sudo_options bash -c $(utils::quote "$command")"
  183. fi
  184. command="$(utils::quote "$command")"
  185. if [[ "${host}" == "127.0.0.1" ]]; then
  186. # 本地执行
  187. log::exec "[command]" "bash -c $(printf "%s" "${command//${SUDO_PASSWORD:-}/zzzzzz}")"
  188. # shellcheck disable=SC2094
  189. COMMAND_OUTPUT=$(eval bash -c "${command}" 2>> "$LOG_FILE" | tee -a "$LOG_FILE")
  190. local status=$?
  191. else
  192. # 远程执行
  193. local ssh_cmd="ssh"
  194. if [[ "${SSH_PASSWORD}" != "" ]]; then
  195. ssh_cmd="sshpass -p \"${SSH_PASSWORD}\" ${ssh_cmd}"
  196. elif [[ "$SSH_PRIVATE_KEY" != "" ]]; then
  197. [ -f "${SSH_PRIVATE_KEY}" ] || { log::error "[exec]" "ssh private_key:${SSH_PRIVATE_KEY} not found."; exit 1; }
  198. ssh_cmd="${ssh_cmd} -i $SSH_PRIVATE_KEY"
  199. fi
  200. log::exec "[command]" "${ssh_cmd//${SSH_PASSWORD:-}/zzzzzz} ${SSH_OPTIONS} ${SSH_USER}@${host} -p ${SSH_PORT} bash -c $(printf "%s" "${command//${SUDO_PASSWORD:-}/zzzzzz}")"
  201. # shellcheck disable=SC2094
  202. COMMAND_OUTPUT=$(eval "${ssh_cmd} ${SSH_OPTIONS} ${SSH_USER}@${host} -p ${SSH_PORT}" bash -c '"${command}"' 2>> "$LOG_FILE" | tee -a "$LOG_FILE")
  203. local status=$?
  204. fi
  205. return $status
  206. }
  207. # 拷贝文件
  208. function command::scp() {
  209. local host=${1:-}
  210. local src=${2:-}
  211. local dest=${3:-/tmp/}
  212. if [[ "${host}" == "127.0.0.1" ]]; then
  213. local command="cp -rf ${src} ${dest}"
  214. log::exec "[command]" "bash -c \"${command}\""
  215. # shellcheck disable=SC2094
  216. COMMAND_OUTPUT=$(bash -c "${command}" 2>> "$LOG_FILE" | tee -a "$LOG_FILE")
  217. local status=$?
  218. else
  219. local scp_cmd="scp"
  220. if [[ "${SSH_PASSWORD}" != "" ]]; then
  221. scp_cmd="sshpass -p \"${SSH_PASSWORD}\" ${scp_cmd}"
  222. elif [[ "$SSH_PRIVATE_KEY" != "" ]]; then
  223. [ -f "${SSH_PRIVATE_KEY}" ] || { log::error "[exec]" "ssh private_key:${SSH_PRIVATE_KEY} not found."; exit 1; }
  224. scp_cmd="${scp_cmd} -i $SSH_PRIVATE_KEY"
  225. fi
  226. log::exec "[command]" "${scp_cmd} ${SSH_OPTIONS} -P ${SSH_PORT} -r ${src} ${SSH_USER}@${host}:${dest}" >> "$LOG_FILE"
  227. # shellcheck disable=SC2094
  228. COMMAND_OUTPUT=$(eval "${scp_cmd} ${SSH_OPTIONS} -P ${SSH_PORT} -r ${src} ${SSH_USER}@${host}:${dest}" 2>> "$LOG_FILE" | tee -a "$LOG_FILE")
  229. local status=$?
  230. fi
  231. return $status
  232. }
  233. # 检查命令是否存在
  234. function check::command_exists() {
  235. local cmd=${1}
  236. local package=${2}
  237. if command -V "$cmd" > /dev/null 2>&1; then
  238. log::info "[check]" "$cmd command exists."
  239. else
  240. log::warning "[check]" "I require $cmd but it's not installed."
  241. log::warning "[check]" "install $package package."
  242. command::exec "127.0.0.1" "yum install -y ${package}"
  243. check::exit_code "$?" "check" "$package install" "exit"
  244. fi
  245. }
  246. ######################################################################################################
  247. # 安装函数
  248. ######################################################################################################
  249. # 节点初始化脚本
  250. function script::init_node() {
  251. # clean
  252. sed -i -e "/$KUBE_APISERVER/d" -e '/-worker-/d' -e '/-master-/d' /etc/hosts
  253. sed -i '/## Kainstall managed start/,/## Kainstall managed end/d' /etc/security/limits.conf /etc/systemd/system.conf /etc/bashrc /etc/rc.local /etc/audit/rules.d/audit.rules
  254. # Disable selinux
  255. sed -i '/SELINUX/s/enforcing/disabled/' /etc/selinux/config
  256. setenforce 0
  257. # Disable swap
  258. swapoff -a && sysctl -w vm.swappiness=0
  259. sed -ri '/^[^#]*swap/s@^@#@' /etc/fstab
  260. # Disable firewalld
  261. for target in firewalld python-firewall firewalld-filesystem iptables; do
  262. systemctl stop $target &>/dev/null || true
  263. systemctl disable $target &>/dev/null || true
  264. done
  265. # repo
  266. [[ -f /etc/yum.repos.d/CentOS-Base.repo && "${SKIP_SET_OS_REPO,,}" == "false" ]] && sed -e 's!^#baseurl=!baseurl=!g' \
  267. -e 's!^mirrorlist=!#mirrorlist=!g' \
  268. -e 's!mirror.centos.org!mirrors.aliyun.com!g' \
  269. -i /etc/yum.repos.d/CentOS-Base.repo
  270. [[ "${OFFLINE_TAG:-}" != "1" && "${SKIP_SET_OS_REPO,,}" == "false" ]] && yum install -y epel-release
  271. [[ -f /etc/yum.repos.d/epel.repo && "${SKIP_SET_OS_REPO,,}" == "false" ]] && sed -e 's!^mirrorlist=!#mirrorlist=!g' \
  272. -e 's!^metalink=!#metalink=!g' \
  273. -e 's!^#baseurl=!baseurl=!g' \
  274. -e 's!//download.*/pub!//mirrors.aliyun.com!g' \
  275. -e 's!http://mirrors\.aliyun!https://mirrors.aliyun!g' \
  276. -i /etc/yum.repos.d/epel.repo
  277. # Change limits
  278. [ ! -f /etc/security/limits.conf_bak ] && cp /etc/security/limits.conf{,_bak}
  279. cat << EOF >> /etc/security/limits.conf
  280. ## Kainstall managed start
  281. root soft nofile 655360
  282. root hard nofile 655360
  283. root soft nproc 655360
  284. root hard nproc 655360
  285. root soft core unlimited
  286. root hard core unlimited
  287. * soft nofile 655360
  288. * hard nofile 655360
  289. * soft nproc 655360
  290. * hard nproc 655360
  291. * soft core unlimited
  292. * hard core unlimited
  293. ## Kainstall managed end
  294. EOF
  295. # /etc/systemd/system.conf
  296. [ -f /etc/security/limits.d/20-nproc.conf ] && sed -i 's#4096#655360#g' /etc/security/limits.d/20-nproc.conf
  297. cat << EOF >> /etc/systemd/system.conf
  298. ## Kainstall managed start
  299. DefaultLimitCORE=infinity
  300. DefaultLimitNOFILE=655360
  301. DefaultLimitNPROC=655360
  302. DefaultTasksMax=75%
  303. ## Kainstall managed end
  304. EOF
  305. # Change sysctl
  306. cat << EOF > /etc/sysctl.d/99-kube.conf
  307. # https://www.kernel.org/doc/Documentation/sysctl/
  308. #############################################################################################
  309. # 调整虚拟内存
  310. #############################################################################################
  311. # Default: 30
  312. # 0 - 任何情况下都不使用swap。
  313. # 1 - 除非内存不足(OOM),否则不使用swap。
  314. vm.swappiness = 0
  315. # 内存分配策略
  316. #0 - 表示内核将检查是否有足够的可用内存供应用进程使用;如果有足够的可用内存,内存申请允许;否则,内存申请失败,并把错误返回给应用进程。
  317. #1 - 表示内核允许分配所有的物理内存,而不管当前的内存状态如何。
  318. #2 - 表示内核允许分配超过所有物理内存和交换空间总和的内存
  319. vm.overcommit_memory=1
  320. # OOM时处理
  321. # 1关闭,等于0时,表示当内存耗尽时,内核会触发OOM killer杀掉最耗内存的进程。
  322. vm.panic_on_oom=0
  323. # vm.dirty_background_ratio 用于调整内核如何处理必须刷新到磁盘的脏页。
  324. # Default value is 10.
  325. # 该值是系统内存总量的百分比,在许多情况下将此值设置为5是合适的。
  326. # 此设置不应设置为零。
  327. vm.dirty_background_ratio = 5
  328. # 内核强制同步操作将其刷新到磁盘之前允许的脏页总数
  329. # 也可以通过更改 vm.dirty_ratio 的值(将其增加到默认值30以上(也占系统内存的百分比))来增加
  330. # 推荐 vm.dirty_ratio 的值在60到80之间。
  331. vm.dirty_ratio = 60
  332. # vm.max_map_count 计算当前的内存映射文件数。
  333. # mmap 限制(vm.max_map_count)的最小值是打开文件的ulimit数量(cat /proc/sys/fs/file-max)。
  334. # 每128KB系统内存 map_count应该大约为1。 因此,在32GB系统上,max_map_count为262144。
  335. # Default: 65530
  336. vm.max_map_count = 2097152
  337. #############################################################################################
  338. # 调整文件
  339. #############################################################################################
  340. fs.may_detach_mounts = 1
  341. # 增加文件句柄和inode缓存的大小,并限制核心转储。
  342. fs.file-max = 2097152
  343. fs.nr_open = 2097152
  344. fs.suid_dumpable = 0
  345. # 文件监控
  346. fs.inotify.max_user_instances=8192
  347. fs.inotify.max_user_watches=524288
  348. fs.inotify.max_queued_events=16384
  349. #############################################################################################
  350. # 调整网络设置
  351. #############################################################################################
  352. # 为每个套接字的发送和接收缓冲区分配的默认内存量。
  353. net.core.wmem_default = 25165824
  354. net.core.rmem_default = 25165824
  355. # 为每个套接字的发送和接收缓冲区分配的最大内存量。
  356. net.core.wmem_max = 25165824
  357. net.core.rmem_max = 25165824
  358. # 除了套接字设置外,发送和接收缓冲区的大小
  359. # 必须使用net.ipv4.tcp_wmem和net.ipv4.tcp_rmem参数分别设置TCP套接字。
  360. # 使用三个以空格分隔的整数设置这些整数,分别指定最小,默认和最大大小。
  361. # 最大大小不能大于使用net.core.wmem_max和net.core.rmem_max为所有套接字指定的值。
  362. # 合理的设置是最小4KiB,默认64KiB和最大2MiB缓冲区。
  363. net.ipv4.tcp_wmem = 20480 12582912 25165824
  364. net.ipv4.tcp_rmem = 20480 12582912 25165824
  365. # 增加最大可分配的总缓冲区空间
  366. # 以页为单位(4096字节)进行度量
  367. net.ipv4.tcp_mem = 65536 25165824 262144
  368. net.ipv4.udp_mem = 65536 25165824 262144
  369. # 为每个套接字的发送和接收缓冲区分配的最小内存量。
  370. net.ipv4.udp_wmem_min = 16384
  371. net.ipv4.udp_rmem_min = 16384
  372. # 启用TCP窗口缩放,客户端可以更有效地传输数据,并允许在代理方缓冲该数据。
  373. net.ipv4.tcp_window_scaling = 1
  374. # 提高同时接受连接数。
  375. net.ipv4.tcp_max_syn_backlog = 10240
  376. # 将net.core.netdev_max_backlog的值增加到大于默认值1000
  377. # 可以帮助突发网络流量,特别是在使用数千兆位网络连接速度时,
  378. # 通过允许更多的数据包排队等待内核处理它们。
  379. net.core.netdev_max_backlog = 65536
  380. # 增加选项内存缓冲区的最大数量
  381. net.core.optmem_max = 25165824
  382. # 被动TCP连接的SYNACK次数。
  383. net.ipv4.tcp_synack_retries = 2
  384. # 允许的本地端口范围。
  385. net.ipv4.ip_local_port_range = 2048 65535
  386. # 防止TCP时间等待
  387. # Default: net.ipv4.tcp_rfc1337 = 0
  388. net.ipv4.tcp_rfc1337 = 1
  389. # 减少tcp_fin_timeout连接的时间默认值
  390. net.ipv4.tcp_fin_timeout = 15
  391. # 积压套接字的最大数量。
  392. # Default is 128.
  393. net.core.somaxconn = 32768
  394. # 打开syncookies以进行SYN洪水攻击保护。
  395. net.ipv4.tcp_syncookies = 1
  396. # 避免Smurf攻击
  397. # 发送伪装的ICMP数据包,目的地址设为某个网络的广播地址,源地址设为要攻击的目的主机,
  398. # 使所有收到此ICMP数据包的主机都将对目的主机发出一个回应,使被攻击主机在某一段时间内收到成千上万的数据包
  399. net.ipv4.icmp_echo_ignore_broadcasts = 1
  400. # 为icmp错误消息打开保护
  401. net.ipv4.icmp_ignore_bogus_error_responses = 1
  402. # 启用自动缩放窗口。
  403. # 如果延迟证明合理,这将允许TCP缓冲区超过其通常的最大值64K。
  404. net.ipv4.tcp_window_scaling = 1
  405. # 打开并记录欺骗,源路由和重定向数据包
  406. net.ipv4.conf.all.log_martians = 1
  407. net.ipv4.conf.default.log_martians = 1
  408. # 告诉内核有多少个未附加的TCP套接字维护用户文件句柄。 万一超过这个数字,
  409. # 孤立的连接会立即重置,并显示警告。
  410. # Default: net.ipv4.tcp_max_orphans = 65536
  411. net.ipv4.tcp_max_orphans = 65536
  412. # 不要在关闭连接时缓存指标
  413. net.ipv4.tcp_no_metrics_save = 1
  414. # 启用RFC1323中定义的时间戳记:
  415. # Default: net.ipv4.tcp_timestamps = 1
  416. net.ipv4.tcp_timestamps = 1
  417. # 启用选择确认。
  418. # Default: net.ipv4.tcp_sack = 1
  419. net.ipv4.tcp_sack = 1
  420. # 增加 tcp-time-wait 存储桶池大小,以防止简单的DOS攻击。
  421. # net.ipv4.tcp_tw_recycle 已从Linux 4.12中删除。请改用net.ipv4.tcp_tw_reuse。
  422. net.ipv4.tcp_max_tw_buckets = 14400
  423. net.ipv4.tcp_tw_reuse = 1
  424. # accept_source_route 选项使网络接口接受设置了严格源路由(SSR)或松散源路由(LSR)选项的数据包。
  425. # 以下设置将丢弃设置了SSR或LSR选项的数据包。
  426. net.ipv4.conf.all.accept_source_route = 0
  427. net.ipv4.conf.default.accept_source_route = 0
  428. # 打开反向路径过滤
  429. net.ipv4.conf.all.rp_filter = 1
  430. net.ipv4.conf.default.rp_filter = 1
  431. # 禁用ICMP重定向接受
  432. net.ipv4.conf.all.accept_redirects = 0
  433. net.ipv4.conf.default.accept_redirects = 0
  434. net.ipv4.conf.all.secure_redirects = 0
  435. net.ipv4.conf.default.secure_redirects = 0
  436. # 禁止发送所有IPv4 ICMP重定向数据包。
  437. net.ipv4.conf.all.send_redirects = 0
  438. net.ipv4.conf.default.send_redirects = 0
  439. # 开启IP转发.
  440. net.ipv4.ip_forward = 1
  441. # 禁止IPv6
  442. net.ipv6.conf.lo.disable_ipv6=1
  443. net.ipv6.conf.all.disable_ipv6 = 1
  444. net.ipv6.conf.default.disable_ipv6 = 1
  445. # 要求iptables不对bridge的数据进行处理
  446. net.bridge.bridge-nf-call-ip6tables = 1
  447. net.bridge.bridge-nf-call-iptables = 1
  448. net.bridge.bridge-nf-call-arptables = 1
  449. # arp缓存
  450. # 存在于 ARP 高速缓存中的最少层数,如果少于这个数,垃圾收集器将不会运行。缺省值是 128
  451. net.ipv4.neigh.default.gc_thresh1=2048
  452. # 保存在 ARP 高速缓存中的最多的记录软限制。垃圾收集器在开始收集前,允许记录数超过这个数字 5 秒。缺省值是 512
  453. net.ipv4.neigh.default.gc_thresh2=4096
  454. # 保存在 ARP 高速缓存中的最多记录的硬限制,一旦高速缓存中的数目高于此,垃圾收集器将马上运行。缺省值是 1024
  455. net.ipv4.neigh.default.gc_thresh3=8192
  456. # 持久连接
  457. net.ipv4.tcp_keepalive_time = 600
  458. net.ipv4.tcp_keepalive_intvl = 30
  459. net.ipv4.tcp_keepalive_probes = 10
  460. # conntrack表
  461. net.nf_conntrack_max=1048576
  462. net.netfilter.nf_conntrack_max=1048576
  463. net.netfilter.nf_conntrack_buckets=262144
  464. net.netfilter.nf_conntrack_tcp_timeout_fin_wait=30
  465. net.netfilter.nf_conntrack_tcp_timeout_time_wait=30
  466. net.netfilter.nf_conntrack_tcp_timeout_close_wait=15
  467. net.netfilter.nf_conntrack_tcp_timeout_established=300
  468. #############################################################################################
  469. # 调整内核参数
  470. #############################################################################################
  471. # 地址空间布局随机化(ASLR)是一种用于操作系统的内存保护过程,可防止缓冲区溢出攻击。
  472. # 这有助于确保与系统上正在运行的进程相关联的内存地址不可预测,
  473. # 因此,与这些流程相关的缺陷或漏洞将更加难以利用。
  474. # Accepted values: 0 = 关闭, 1 = 保守随机化, 2 = 完全随机化
  475. kernel.randomize_va_space = 2
  476. # 调高 PID 数量
  477. kernel.pid_max = 65536
  478. kernel.threads-max=30938
  479. # coredump
  480. kernel.core_pattern=core
  481. # 决定了检测到soft lockup时是否自动panic,缺省值是0
  482. kernel.softlockup_all_cpu_backtrace=1
  483. kernel.softlockup_panic=1
  484. EOF
  485. # history
  486. cat << EOF >> /etc/bashrc
  487. ## Kainstall managed start
  488. # history actions record,include action time, user, login ip
  489. HISTFILESIZE=5000
  490. HISTSIZE=5000
  491. USER_IP=\$(who -u am i 2>/dev/null | awk '{print \$NF}' | sed -e 's/[()]//g')
  492. if [ -z \$USER_IP ]
  493. then
  494. USER_IP=\$(hostname -i)
  495. fi
  496. HISTTIMEFORMAT="%Y-%m-%d %H:%M:%S \$USER_IP:\$(whoami) "
  497. export HISTFILESIZE HISTSIZE HISTTIMEFORMAT
  498. # PS1
  499. PS1='\[\033[0m\]\[\033[1;36m\][\u\[\033[0m\]@\[\033[1;32m\]\h\[\033[0m\] \[\033[1;31m\]\w\[\033[0m\]\[\033[1;36m\]]\[\033[33;1m\]\\$ \[\033[0m\]'
  500. ## Kainstall managed end
  501. EOF
  502. # journal
  503. mkdir -p /var/log/journal /etc/systemd/journald.conf.d
  504. cat << EOF > /etc/systemd/journald.conf.d/99-prophet.conf
  505. [Journal]
  506. # 持久化保存到磁盘
  507. Storage=persistent
  508. # 压缩历史日志
  509. Compress=yes
  510. SyncIntervalSec=5m
  511. RateLimitInterval=30s
  512. RateLimitBurst=1000
  513. # 最大占用空间 2G
  514. SystemMaxUse=2G
  515. # 单日志文件最大 100M
  516. SystemMaxFileSize=100M
  517. # 日志保存时间 3 周
  518. MaxRetentionSec=3week
  519. # 不将日志转发到 syslog
  520. ForwardToSyslog=no
  521. EOF
  522. # motd
  523. cat << EOF > /etc/profile.d/zz-ssh-login-info.sh
  524. #!/bin/sh
  525. #
  526. # @Time : 2020-02-04
  527. # @Author : lework
  528. # @Desc : ssh login banner
  529. export PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:\$PATH
  530. #shopt -q login_shell && : || return 0
  531. # os
  532. upSeconds="\$(cut -d. -f1 /proc/uptime)"
  533. secs=\$((\${upSeconds}%60))
  534. mins=\$((\${upSeconds}/60%60))
  535. hours=\$((\${upSeconds}/3600%24))
  536. days=\$((\${upSeconds}/86400))
  537. UPTIME_INFO=\$(printf "%d days, %02dh %02dm %02ds" "\$days" "\$hours" "\$mins" "\$secs")
  538. if [ -f /etc/redhat-release ] ; then
  539. PRETTY_NAME=\$(< /etc/redhat-release)
  540. elif [ -f /etc/debian_version ]; then
  541. DIST_VER=\$(</etc/debian_version)
  542. PRETTY_NAME="\$(grep PRETTY_NAME /etc/os-release | sed -e 's/PRETTY_NAME=//g' -e 's/"//g') (\$DIST_VER)"
  543. else
  544. PRETTY_NAME=\$(cat /etc/*-release | grep "PRETTY_NAME" | sed -e 's/PRETTY_NAME=//g' -e 's/"//g')
  545. fi
  546. if [[ -d "/system/app/" && -d "/system/priv-app" ]]; then
  547. model="\$(getprop ro.product.brand) \$(getprop ro.product.model)"
  548. elif [[ -f /sys/devices/virtual/dmi/id/product_name ||
  549. -f /sys/devices/virtual/dmi/id/product_version ]]; then
  550. model="\$(< /sys/devices/virtual/dmi/id/product_name)"
  551. model+=" \$(< /sys/devices/virtual/dmi/id/product_version)"
  552. elif [[ -f /sys/firmware/devicetree/base/model ]]; then
  553. model="\$(< /sys/firmware/devicetree/base/model)"
  554. elif [[ -f /tmp/sysinfo/model ]]; then
  555. model="\$(< /tmp/sysinfo/model)"
  556. fi
  557. MODEL_INFO=\${model}
  558. KERNEL=\$(uname -srmo)
  559. USER_NUM=\$(who -u | wc -l)
  560. RUNNING=\$(ps ax | wc -l | tr -d " ")
  561. # disk
  562. totaldisk=\$(df -h -x devtmpfs -x tmpfs -x debugfs -x aufs -x overlay --total 2>/dev/null | tail -1)
  563. disktotal=\$(awk '{print \$2}' <<< "\${totaldisk}")
  564. diskused=\$(awk '{print \$3}' <<< "\${totaldisk}")
  565. diskusedper=\$(awk '{print \$5}' <<< "\${totaldisk}")
  566. DISK_INFO="\033[0;33m\${diskused}\033[0m of \033[1;34m\${disktotal}\033[0m disk space used (\033[0;33m\${diskusedper}\033[0m)"
  567. # cpu
  568. cpu=\$(awk -F':' '/^model name/ {print \$2}' /proc/cpuinfo | uniq | sed -e 's/^[ \t]*//')
  569. cpun=\$(grep -c '^processor' /proc/cpuinfo)
  570. cpuc=\$(grep '^cpu cores' /proc/cpuinfo | tail -1 | awk '{print \$4}')
  571. cpup=\$(grep '^physical id' /proc/cpuinfo | wc -l)
  572. CPU_INFO="\${cpu} \${cpup}P \${cpuc}C \${cpun}L"
  573. # get the load averages
  574. read one five fifteen rest < /proc/loadavg
  575. LOADAVG_INFO="\033[0;33m\${one}\033[0m / \${five} / \${fifteen} with \033[1;34m\$(( cpun*cpuc ))\033[0m core(s) at \033[1;34m\$(grep '^cpu MHz' /proc/cpuinfo | tail -1 | awk '{print \$4}')\033 MHz"
  576. # mem
  577. MEM_INFO="\$(cat /proc/meminfo | awk '/MemTotal:/{total=\$2/1024/1024;next} /MemAvailable:/{use=total-\$2/1024/1024; printf("\033[0;33m%.2fGiB\033[0m of \033[1;34m%.2fGiB\033[0m RAM used (\033[0;33m%.2f%%\033[0m)",use,total,(use/total)*100);}')"
  578. # network
  579. # extranet_ip=" and \$(curl -s ip.cip.cc)"
  580. IP_INFO="\$(ip a|grep -E '^[0-9]+: em*|^[0-9]+: eno*|^[0-9]+: enp*|^[0-9]+: ens*|^[0-9]+: eth*|^[0-9]+: wlp*' -A2|grep inet|awk -F ' ' '{print $2}'|cut -f1 -d/|xargs echo)"
  581. # Container info
  582. CONTAINER_INFO="\$(sudo /usr/bin/crictl ps -a -o yaml 2> /dev/null | awk '/^ state: /{gsub("CONTAINER_", "", \$NF) ++S[\$NF]}END{for(m in S) printf "%s%s:%s ",substr(m,1,1),tolower(substr(m,2)),S[m]}')Images:\$(sudo /usr/bin/crictl images -q 2> /dev/null | wc -l)"
  583. # info
  584. echo -e "
  585. Information as of: \033[1;34m\$(date +"%Y-%m-%d %T")\033[0m
  586. \033[0;1;31mProduct\033[0m............: \${MODEL_INFO}
  587. \033[0;1;31mOS\033[0m.................: \${PRETTY_NAME}
  588. \033[0;1;31mKernel\033[0m.............: \${KERNEL}
  589. \033[0;1;31mCPU\033[0m................: \${CPU_INFO}
  590. \033[0;1;31mHostname\033[0m...........: \033[1;34m\$(hostname)\033[0m
  591. \033[0;1;31mIP Addresses\033[0m.......: \033[1;34m\${IP_INFO}\033[0m
  592. \033[0;1;31mUptime\033[0m.............: \033[0;33m\${UPTIME_INFO}\033[0m
  593. \033[0;1;31mMemory\033[0m.............: \${MEM_INFO}
  594. \033[0;1;31mLoad Averages\033[0m......: \${LOADAVG_INFO}
  595. \033[0;1;31mDisk Usage\033[0m.........: \${DISK_INFO}
  596. \033[0;1;31mUsers online\033[0m.......: \033[1;34m\${USER_NUM}\033[0m
  597. \033[0;1;31mRunning Processes\033[0m..: \033[1;34m\${RUNNING}\033[0m
  598. \033[0;1;31mContainer Info\033[0m.....: \${CONTAINER_INFO}
  599. "
  600. EOF
  601. chmod +x /etc/profile.d/zz-ssh-login-info.sh
  602. echo 'ALL ALL=(ALL) NOPASSWD:/usr/bin/crictl' > /etc/sudoers.d/crictl
  603. # time sync
  604. ntpd --help >/dev/null 2>&1 && yum remove -y ntp
  605. [[ "${OFFLINE_TAG:-}" != "1" ]] && yum install -y chrony
  606. [ ! -f /etc/chrony.conf_bak ] && cp /etc/chrony.conf{,_bak} #备份默认配置
  607. cat << EOF > /etc/chrony.conf
  608. server ntp.aliyun.com iburst
  609. server cn.ntp.org.cn iburst
  610. server ntp.shu.edu.cn iburst
  611. server 0.cn.pool.ntp.org iburst
  612. server 1.cn.pool.ntp.org iburst
  613. server 2.cn.pool.ntp.org iburst
  614. server 3.cn.pool.ntp.org iburst
  615. driftfile /var/lib/chrony/drift
  616. makestep 1.0 3
  617. logdir /var/log/chrony
  618. EOF
  619. timedatectl set-timezone Asia/Shanghai
  620. chronyd -q -t 1 'server cn.pool.ntp.org iburst maxsamples 1'
  621. systemctl enable chronyd
  622. systemctl start chronyd
  623. chronyc sources -v
  624. chronyc sourcestats
  625. hwclock --systohc
  626. # package
  627. [[ "${OFFLINE_TAG:-}" != "1" ]] && yum install -y curl wget
  628. # ipvs
  629. [[ "${OFFLINE_TAG:-}" != "1" ]] && yum install -y ipvsadm ipset sysstat conntrack libseccomp
  630. module=(
  631. ip_vs
  632. ip_vs_rr
  633. ip_vs_wrr
  634. ip_vs_sh
  635. overlay
  636. nf_conntrack
  637. br_netfilter
  638. )
  639. [ -f /etc/modules-load.d/ipvs.conf ] && cp -f /etc/modules-load.d/ipvs.conf{,_bak}
  640. for kernel_module in "${module[@]}";do
  641. /sbin/modinfo -F filename "$kernel_module" |& grep -qv ERROR && echo "$kernel_module" >> /etc/modules-load.d/ipvs.conf
  642. done
  643. systemctl restart systemd-modules-load
  644. systemctl enable systemd-modules-load
  645. sysctl --system
  646. # audit
  647. [[ "${OFFLINE_TAG:-}" != "1" ]] && yum install -y audit audit-libs
  648. # /etc/audit/rules.d/audit.rules
  649. cat << EOF >> /etc/audit/rules.d/audit.rules
  650. ## Kainstall managed start
  651. # Ignore errors
  652. -i
  653. # SYSCALL
  654. -a always,exit -F arch=b64 -S kill,tkill,tgkill -F a1=9 -F key=trace_kill_9
  655. -a always,exit -F arch=b64 -S kill,tkill,tgkill -F a1=15 -F key=trace_kill_15
  656. # docker
  657. -w /usr/bin/dockerd -k docker
  658. -w /var/lib/docker -k docker
  659. -w /etc/docker -k docker
  660. -w /usr/lib/systemd/system/docker.service -k docker
  661. -w /etc/systemd/system/docker.service -k docker
  662. -w /usr/lib/systemd/system/docker.socket -k docker
  663. -w /etc/default/docker -k docker
  664. -w /etc/sysconfig/docker -k docker
  665. -w /etc/docker/daemon.json -k docker
  666. # containerd
  667. -w /usr/bin/containerd -k containerd
  668. -w /var/lib/containerd -k containerd
  669. -w /usr/lib/systemd/system/containerd.service -k containerd
  670. -w /etc/containerd/config.toml -k containerd
  671. # cri-o
  672. -w /usr/bin/crio -k cri-o
  673. -w /etc/crio -k cri-o
  674. # runc
  675. -w /usr/bin/runc -k runc
  676. # kube
  677. -w /usr/bin/kubeadm -k kubeadm
  678. -w /usr/bin/kubelet -k kubelet
  679. -w /usr/bin/kubectl -k kubectl
  680. -w /var/lib/kubelet -k kubelet
  681. -w /etc/kubernetes -k kubernetes
  682. ## Kainstall managed end
  683. EOF
  684. chmod 600 /etc/audit/rules.d/audit.rules
  685. sed -i 's#max_log_file =.*#max_log_file = 80#g' /etc/audit/auditd.conf
  686. if [ -f /usr/libexec/initscripts/legacy-actions/auditd/restart ]; then
  687. /usr/libexec/initscripts/legacy-actions/auditd/restart
  688. else
  689. systemctl stop auditd && systemctl start auditd
  690. fi
  691. systemctl enable auditd
  692. grep single-request-reopen /etc/resolv.conf || sed -i '1ioptions timeout:2 attempts:3 rotate single-request-reopen' /etc/resolv.conf
  693. ipvsadm --clear
  694. iptables -F && iptables -t nat -F && iptables -t mangle -F && iptables -X
  695. }
  696. # 升级内核
  697. function script::upgrade_kernel() {
  698. local ver; ver=$(rpm --eval "%{centos_ver}")
  699. [[ "${OFFLINE_TAG:-}" != "1" ]] && yum install -y "https://www.elrepo.org/elrepo-release-${ver}.el${ver}.elrepo.noarch.rpm"
  700. sed -e "s/^mirrorlist=/#mirrorlist=/g" \
  701. -e "s/elrepo.org\/linux/mirrors.tuna.tsinghua.edu.cn\/elrepo/g" \
  702. -i /etc/yum.repos.d/elrepo.repo
  703. [[ "${OFFLINE_TAG:-}" != "1" ]] && yum install -y --disablerepo="*" --enablerepo=elrepo-kernel kernel-lt{,-devel}
  704. grub2-set-default 0 && grub2-mkconfig -o /etc/grub2.cfg
  705. grubby --default-kernel
  706. grubby --args="user_namespace.enable=1" --update-kernel="$(grubby --default-kernel)"
  707. }
  708. # 节点软件升级
  709. function script::upgrage_kube() {
  710. local role=${1:-init}
  711. local version="-${2:-latest}"
  712. version="${version#-latest}"
  713. set -e
  714. echo '[install] kubeadm'
  715. kubeadm version
  716. yum install -y "kubeadm${version}" --disableexcludes=kubernetes
  717. kubeadm version
  718. echo '[upgrade]'
  719. if [[ "$role" == "init" ]]; then
  720. local plan_info; plan_info=$(kubeadm upgrade plan)
  721. local v; v=$(printf "%s" "$plan_info" | grep 'kubeadm upgrade apply ' | awk '{print $4}'| tail -1 )
  722. printf "%s\n" "${plan_info}"
  723. kubeadm upgrade apply "${v}" -y
  724. else
  725. kubeadm upgrade node
  726. fi
  727. echo '[install] kubelet kubectl'
  728. kubectl version --client=true
  729. yum install -y "kubelet${version}" "kubectl${version}" --disableexcludes=kubernetes
  730. kubectl version --client=true
  731. [ -f /usr/lib/systemd/system/kubelet.service.d/10-kubeadm.conf ] && \
  732. sed -i 's#^\[Service\]#[Service]\nCPUAccounting=true\nMemoryAccounting=true#g' /usr/lib/systemd/system/kubelet.service.d/10-kubeadm.conf
  733. systemctl daemon-reload
  734. systemctl restart kubelet
  735. }
  # Install and configure Docker CE on the current node.
  # Arguments:
  #   $1 - docker-ce package version; "latest" installs without a version
  #        suffix (default: latest)
  # Globals:   OFFLINE_TAG (read) - "1" skips every yum operation
  function script::install_docker() {
    local pkg_suffix="-${1:-latest}"
    pkg_suffix="${pkg_suffix#-latest}"

    # /etc/yum.repos.d/docker-ce.repo
    cat << EOF > /etc/yum.repos.d/docker-ce.repo
[docker-ce-stable]
name=Docker CE Stable - \$basearch
baseurl=https://mirrors.aliyun.com/docker-ce/linux/centos/$(rpm --eval '%{centos_ver}')/\$basearch/stable
enabled=1
gpgcheck=1
gpgkey=https://mirrors.aliyun.com/docker-ce/linux/centos/gpg
EOF

    if [[ "${OFFLINE_TAG:-}" != "1" ]]; then
      # Remove an existing installation before installing the requested one.
      if [ -f "$(which docker)" ]; then
        yum remove -y docker-ce docker-ce-cli containerd.io
      fi
      yum install -y "docker-ce${pkg_suffix}" "docker-ce-cli${pkg_suffix}" containerd.io bash-completion
    fi

    if [ -f /usr/share/bash-completion/completions/docker ]; then
      cp -f /usr/share/bash-completion/completions/docker /etc/bash_completion.d/
    fi

    [ -d /etc/docker ] || mkdir /etc/docker

    # /etc/docker/daemon.json
    cat << EOF > /etc/docker/daemon.json
{
"data-root": "/var/lib/docker",
"log-driver": "json-file",
"log-opts": {
"max-size": "100m",
"max-file": "3"
},
"default-ulimits": {
"nofile": {
"Name": "nofile",
"Hard": 655360,
"Soft": 655360
},
"nproc": {
"Name": "nproc",
"Hard": 655360,
"Soft": 655360
}
},
"live-restore": true,
"oom-score-adjust": -1000,
"max-concurrent-downloads": 10,
"max-concurrent-uploads": 10,
"storage-driver": "overlay2",
"storage-opts": ["overlay2.override_kernel_check=true"],
"exec-opts": ["native.cgroupdriver=systemd"],
"registry-mirrors": [
"https://573d5l8e.mirror.aliyuncs.com"
]
}
EOF

    # Un-comment containerd's oom_score setting and change it to -999.
    sed -i 's|#oom_score = 0|oom_score = -999|' /etc/containerd/config.toml

    # /etc/crictl.yaml
    cat << EOF > /etc/crictl.yaml
runtime-endpoint: unix:///var/run/dockershim.sock
image-endpoint: unix:///var/run/dockershim.sock
timeout: 2
debug: false
pull-image-on-create: true
disable-pull-on-run: false
EOF

    systemctl enable containerd
    systemctl restart containerd
    systemctl enable docker
    systemctl restart docker
  }
  # Install and configure containerd on the current node.
  # Arguments:
  #   $1 - containerd.io package version; "latest" installs without a version
  #        suffix (default: latest)
  # Globals:   OFFLINE_TAG (read) - "1" skips every yum operation
  function script::install_containerd() {
    local pkg_suffix="-${1:-latest}"
    pkg_suffix="${pkg_suffix#-latest}"
    local config_toml="/etc/containerd/config.toml"

    # /etc/yum.repos.d/docker-ce.repo
    cat << EOF > /etc/yum.repos.d/docker-ce.repo
[docker-ce-stable]
name=Docker CE Stable - \$basearch
baseurl=https://mirrors.aliyun.com/docker-ce/linux/centos/$(rpm --eval '%{centos_ver}')/\$basearch/stable
enabled=1
gpgcheck=1
gpgkey=https://mirrors.aliyun.com/docker-ce/linux/centos/gpg
EOF

    if [[ "${OFFLINE_TAG:-}" != "1" ]]; then
      if [ -f "$(which runc)" ]; then
        yum remove -y runc
      fi
      if [ -f "$(which containerd)" ]; then
        yum remove -y containerd.io
      fi
      yum install -y containerd.io"${pkg_suffix}" containernetworking bash-completion
    fi

    if [ -d /etc/bash_completion.d ]; then
      crictl completion bash > /etc/bash_completion.d/crictl
    fi

    # Start from the default configuration, then rewrite registry, cgroup,
    # oom_score and download-concurrency settings in place.
    containerd config default > "${config_toml}"
    sed -i \
      -e "s#k8s.gcr.io#registry.cn-hangzhou.aliyuncs.com/kainstall#g" \
      -e "s#https://registry-1.docker.io#https://573d5l8e.mirror.aliyuncs.com#g" \
      -e "s#SystemdCgroup = false#SystemdCgroup = true#g" \
      -e "s#oom_score = 0#oom_score = -999#" \
      -e "s#max_concurrent_downloads = 3#max_concurrent_downloads = 10#g" \
      "${config_toml}"

    # Append a docker.io mirror entry only when the file does not mention
    # docker.io yet (grep prints any matching lines to stdout).
    if ! grep docker.io "${config_toml}"; then
      sed -i -e "/registry.mirrors]/a\ \ \ \ \ \ \ \ [plugins.\"io.containerd.grpc.v1.cri\".registry.mirrors.\"docker.io\"]\n endpoint = [\"https://573d5l8e.mirror.aliyuncs.com\"]" \
        "${config_toml}"
    fi

    # /etc/crictl.yaml
    cat << EOF > /etc/crictl.yaml
runtime-endpoint: unix:///run/containerd/containerd.sock
image-endpoint: unix:///run/containerd/containerd.sock
timeout: 2
debug: false
pull-image-on-create: true
disable-pull-on-run: false
EOF

    systemctl restart containerd
    systemctl enable containerd
  }
  # Install and configure cri-o on the current node.
  # Arguments:
  #   $1 - cri-o repository version used in the repo URLs (default: latest)
  # Globals:   OFFLINE_TAG (read)     - "1" skips every yum operation
  #            KUBE_POD_SUBNET (read) - replaces 10.85.0.0/16 in the bridge CNI config
  #            os (written)           - assigned without `local`
  function script::install_cri-o() {
    local repo_version="${1:-latest}"
    repo_version="${repo_version#-latest}"

    os="CentOS_$(rpm --eval '%{centos_ver}')" && echo "${os}"

    # /etc/yum.repos.d/devel_kubic_libcontainers_stable.repo
    cat << EOF > /etc/yum.repos.d/devel_kubic_libcontainers_stable.repo
[devel_kubic_libcontainers_stable]
name=Stable Releases of Upstream github.com/containers packages
type=rpm-md
baseurl=https://download.opensuse.org/repositories/devel:/kubic:/libcontainers:/stable/${os}/
gpgcheck=1
gpgkey=https://download.opensuse.org/repositories/devel:/kubic:/libcontainers:/stable/${os}/repodata/repomd.xml.key
enabled=1
[devel_kubic_libcontainers_stable_cri-o]
name=devel:kubic:libcontainers:stable:cri-o
type=rpm-md
baseurl=https://download.opensuse.org/repositories/devel:/kubic:/libcontainers:/stable:/cri-o:/${repo_version}/${os}/
gpgcheck=1
gpgkey=https://download.opensuse.org/repositories/devel:/kubic:/libcontainers:/stable:/cri-o:/${repo_version}/${os}/repodata/repomd.xml.key
enabled=1
EOF

    if [[ "${OFFLINE_TAG:-}" != "1" ]]; then
      if [ -f "$(which runc)" ]; then
        yum remove -y runc
      fi
      if [ -f "$(which crio)" ]; then
        yum remove -y cri-o
      fi
      if [ -f "$(which docker)" ]; then
        yum remove -y docker-ce docker-ce-cli containerd.io
      fi
      # First try with the docker-ce repo disabled; retry with all repos on failure.
      yum install -y runc cri-o bash-completion --disablerepo=docker-ce-stable || yum install -y runc cri-o bash-completion
    fi

    if [ -d /etc/bash_completion.d ]; then
      crictl completion bash > /etc/bash_completion.d/crictl
      crio completion bash > /etc/bash_completion.d/crio
      crio-status completion bash > /etc/bash_completion.d/crio-status
    fi

    if [ ! -f /etc/crio/crio.conf ]; then
      crio config --default > /etc/crio/crio.conf
    fi
    sed -i \
      -e "s#k8s.gcr.io#registry.cn-hangzhou.aliyuncs.com/kainstall#g" \
      -e 's|#registries = \[|registries = ["docker.io", "quay.io"]|g' \
      /etc/crio/crio.conf

    # /etc/crio/crio.conf
    cat << EOF >> /etc/crio/crio.conf
[crio.image]
pause_image = "registry.cn-hangzhou.aliyuncs.com/kainstall/pause:3.6"
EOF

    # /etc/containers/registries.conf.d/000-dockerio.conf
    if [ -d /etc/containers/registries.conf.d ]; then
      cat << EOF > /etc/containers/registries.conf.d/000-dockerio.conf
[[registry]]
prefix = "docker.io"
insecure = false
blocked = false
location = "docker.io"
[[registry.mirror]]
location = "573d5l8e.mirror.aliyuncs.com"
insecure = true
EOF
    fi

    # /etc/crictl.yaml
    cat << EOF > /etc/crictl.yaml
runtime-endpoint: unix:///var/run/crio/crio.sock
image-endpoint: unix:///var/run/crio/crio.sock
timeout: 2
debug: false
pull-image-on-create: true
disable-pull-on-run: false
EOF

    # /etc/cni/net.d/100-crio-bridge.conf
    sed -i "s#10.85.0.0/16#${KUBE_POD_SUBNET:-10.85.0.0/16}#g" /etc/cni/net.d/100-crio-bridge.conf

    # /etc/cni/net.d/10-crio.conf
    cat << EOF > /etc/cni/net.d/10-crio.conf
{
$(grep cniVersion /etc/cni/net.d/100-crio-bridge.conf)
"name": "crio",
"type": "flannel"
}
EOF

    # All three CNI config files are then moved out of /etc/cni/net.d into /tmp.
    mv /etc/cni/net.d/100-crio-bridge.conf /etc/cni/net.d/10-crio.conf /etc/cni/net.d/200-loopback.conf /tmp/

    systemctl restart crio
    systemctl enable crio
  }
  # Install kubeadm, kubelet and kubectl on the current node and enable kubelet.
  # Arguments:
  #   $1 - package version; "latest" installs without a version suffix
  #        (default: latest)
  # Globals:   OFFLINE_TAG (read) - "1" skips every yum operation
  function script::install_kube() {
    local pkg_suffix="-${1:-latest}"
    pkg_suffix="${pkg_suffix#-latest}"
    local dropin_dir="/usr/lib/systemd/system/kubelet.service.d"
    local kube_bin

    # /etc/yum.repos.d/kubernetes.repo
    cat <<EOF > /etc/yum.repos.d/kubernetes.repo
[kubernetes]
name=Kubernetes
baseurl=https://mirrors.aliyun.com/kubernetes/yum/repos/kubernetes-el7-x86_64/
enabled=1
gpgcheck=0
repo_gpgcheck=0
gpgkey=https://mirrors.aliyun.com/kubernetes/yum/doc/yum-key.gpg https://mirrors.aliyun.com/kubernetes/yum/doc/rpm-package-key.gpg
EOF

    if [[ "${OFFLINE_TAG:-}" != "1" ]]; then
      # Remove whichever of the three tools is already present, then reinstall.
      for kube_bin in kubeadm kubelet kubectl; do
        if [ -f "/usr/bin/${kube_bin}" ]; then
          yum remove -y "${kube_bin}"
        fi
      done
      yum install -y "kubeadm${pkg_suffix}" "kubelet${pkg_suffix}" "kubectl${pkg_suffix}" --disableexcludes=kubernetes
    fi

    if [ -d /etc/bash_completion.d ]; then
      kubectl completion bash > /etc/bash_completion.d/kubectl
      kubeadm completion bash > /etc/bash_completion.d/kubadm
    fi

    if [ ! -d "${dropin_dir}" ]; then
      mkdir -p "${dropin_dir}"
    fi

    # kubelet drop-in: resource accounting, cgroup slice directories, kube.slice
    cat << EOF > "${dropin_dir}/11-cgroup.conf"
[Service]
CPUAccounting=true
MemoryAccounting=true
BlockIOAccounting=true
ExecStartPre=/bin/bash -c '/bin/mkdir -p /sys/fs/cgroup/{cpuset,memory,hugetlb,systemd,pids,"cpu,cpuacct"}/{system,kube,kubepods}.slice||:'
Slice=kube.slice
EOF

    systemctl daemon-reload
    systemctl enable kubelet
    systemctl restart kubelet
  }
  # Install haproxy on the current node and configure it to listen on :6443 and
  # balance TCP connections across the given API servers.
  # Arguments:
  #   $* - API server addresses (whitespace separated); each one becomes a
  #        "server apiserverN <addr>:6443 check" backend line
  # Globals:   OFFLINE_TAG (read) - "1" skips every yum operation
  function script::install_haproxy() {
    local api_servers="$*"
    local backend_servers

    if [[ "${OFFLINE_TAG:-}" != "1" ]]; then
      if [ -f /usr/bin/haproxy ]; then
        yum remove -y haproxy
      fi
      yum install -y haproxy
    fi

    # Keep a one-time backup of the packaged configuration.
    if [ ! -f /etc/haproxy/haproxy.cfg_bak ]; then
      cp /etc/haproxy/haproxy.cfg{,_bak}
    fi

    # One numbered backend line per API server.
    backend_servers=$(index=1;for h in $api_servers;do echo " server apiserver${index} $h:6443 check";index=$((index+1));done)

    # /etc/haproxy/haproxy.cfg
    cat << EOF > /etc/haproxy/haproxy.cfg
global
log /dev/log local0
log /dev/log local1 notice
tune.ssl.default-dh-param 2048
defaults
log global
mode http
option dontlognull
timeout connect 5000ms
timeout client 600000ms
timeout server 600000ms
listen stats
bind :19090
mode http
balance
stats uri /haproxy_stats
stats auth admin:admin123
stats admin if TRUE
frontend kube-apiserver-https
mode tcp
option tcplog
bind :6443
default_backend kube-apiserver-backend
backend kube-apiserver-backend
mode tcp
balance roundrobin
stick-table type ip size 200k expire 30m
stick on src
${backend_servers}
EOF

    systemctl enable haproxy
    systemctl restart haproxy
  }
  # Download the helm release tarball and install the binary into /usr/local/bin.
  # Arguments:
  #   $1 - helm version without the leading "v" (default: 3.10.1)
  # Returns:   non-zero as soon as download, extraction or installation fails;
  #            otherwise the status of `helm version`
  function script::install_helm() {
    local version="${1:-3.10.1}"
    local tarball="helm-v${version}-linux-amd64.tar.gz"
    local workdir
    local rc=0

    # A private scratch directory avoids clobbering (or trusting) files that
    # already sit in /tmp under a predictable name.
    workdir="$(mktemp -d /tmp/helm-install.XXXXXX)" || return 1

    # Run the steps in a subshell so the caller's working directory is untouched
    # and the first failing step stops the sequence.
    (
      cd "${workdir}" || exit 1
      # Download (Huawei Cloud mirror)
      wget "https://mirrors.huaweicloud.com/helm/v${version}/${tarball}" || exit 1
      # Unpack
      tar -zxvf "${tarball}" || exit 1
      # Install
      sudo mv linux-amd64/helm /usr/local/bin/ || exit 1
    ) || rc=$?

    # Clean up
    rm -rf -- "${workdir:?}"
    if (( rc != 0 )); then
      return "${rc}"
    fi

    # Verify
    helm version
  }
  # Check the commands this script relies on via check::command_exists
  # (called with a command name followed by a package name).
  # Globals:   OFFLINE_TAG (read) - tar is only checked when this is "1"
  # Returns:   status of the last check::command_exists call that ran
  function check::command() {
    check::command_exists ssh openssh-clients
    check::command_exists sshpass sshpass
    check::command_exists wget wget
    # An `if` (rather than a trailing `[[ ... ]] && ...`) keeps the function
    # from returning 1 merely because the tar check was skipped.
    if [[ "${OFFLINE_TAG:-}" == "1" ]]; then
      check::command_exists tar tar
    fi
  }
  # Check that a command can be executed on every master and worker node.
  # Globals:   MASTER_NODES, WORKER_NODES (read); host (written, loop variable)
  # A failed check is reported through check::exit_code with the "exit" flag.
  function check::ssh_conn() {
    for host in $MASTER_NODES $WORKER_NODES; do
      # 127.0.0.1 is skipped.
      if [[ "$host" != "127.0.0.1" ]]; then
        command::exec "${host}" "echo 0"
        check::exit_code "$?" "check" "ssh $host connection" "exit"
      fi
    done
  }
  # Check that every node runs an operating system listed in OS_SUPPORT.
  # Globals:   OS_SUPPORT, MASTER_NODES, WORKER_NODES (read); host (written)
  # The probe exits 0 on a node when "<ID><VERSION_ID>" from /etc/os-release is
  # a substring of OS_SUPPORT, and 1 otherwise.
  function check::os() {
    log::info "[check]" "os support: ${OS_SUPPORT}"

    # The probe is the same for every node, so build it once.
    local os_probe="
[ -f /etc/os-release ] && source /etc/os-release
echo client_os:\${ID:-}\${VERSION_ID:-}
if [[ \"${OS_SUPPORT}\" == *\"\${ID:-}\${VERSION_ID:-}\"* ]]; then
exit 0
fi
exit 1
"

    for host in $MASTER_NODES $WORKER_NODES; do
      command::exec "${host}" "${os_probe}"
      check::exit_code "$?" "check" "$host os support" "exit"
    done
  }
  # Check that the kernel on every node is not older than the given version.
  # Arguments:
  #   $1 - minimum kernel version as major.minor.patch, e.g. 4.9.17
  # Globals:   MASTER_NODES, WORKER_NODES (read); host (written)
  # Both versions are encoded as major*1000000 + minor*1000 + patch so they can
  # be compared as plain integers.
  function check::kernel() {
    local min_version=${1:-}
    log::info "[check]" "kernel version not less than ${min_version}"

    local min_encoded
    min_encoded=$(echo "${min_version}" | awk -F. '{ printf("%d%03d%03d\n", $1,$2,$3); }')

    # On the node, everything from the first "-" of `uname -r` is dropped
    # before the same encoding is applied.
    local kernel_probe="
kernel_version=\$(uname -r)
kernel_version=\$(echo \${kernel_version/-*} | awk -F. '{ printf(\"%d%03d%03d\n\", \$1,\$2,\$3); }')
echo kernel_version \${kernel_version}
[[ \${kernel_version} -ge ${min_encoded} ]] && exit 0 || exit 1
"

    for host in $MASTER_NODES $WORKER_NODES; do
      command::exec "${host}" "${kernel_probe}"
      check::exit_code "$?" "check" "$host kernel version" "exit"
    done
  }
  # Check API server connectivity by running `kubectl get node` on the
  # management node.
  # Globals:   MGMT_NODE (read)
  function check::apiserver_conn() {
    command::exec "${MGMT_NODE}" "kubectl get node"
    local kubectl_rc=$?
    check::exit_code "${kubectl_rc}" "check" "conn apiserver" "exit"
  }
  # Log the outcome of a step based on its exit code.
  # Arguments:
  #   $1 - exit code of the step; "0" means success
  #   $2 - tag shown in square brackets in the log line
  #   $3 - description of the step
  #   $4 - pass "exit" to terminate the script with $1 when the step failed
  function check::exit_code() {
    local rc=${1:-}
    local tag=${2:-}
    local what=${3:-}
    local on_failure=${4:-}

    if [[ "${rc}" == "0" ]]; then
      log::info "[${tag}]" "${what} succeeded."
      return
    fi

    log::error "[${tag}]" "${what} failed."
    [[ "${on_failure}" == "exit" ]] && exit "${rc}"
  }
  # Run the pre-flight checks in order: commands, ssh connectivity, OS support,
  # kernel version (cilium only) and API server connectivity (only when an
  # add / delete / upgrade / renew-cert operation was requested).
  # Globals:   KUBE_NETWORK, ADD_TAG, DEL_TAG, UPGRADE_TAG, RENEW_CERT_TAG (read)
  function check::preflight() {
    # check command
    check::command

    # check ssh conn
    check::ssh_conn

    # check os
    check::os

    # check os kernel
    if [[ "${KUBE_NETWORK:-}" == "cilium" ]]; then
      check::kernel 4.9.17
    fi

    # check api-server conn
    local requested_ops=$(( ${ADD_TAG:-0} + ${DEL_TAG:-0} + ${UPGRADE_TAG:-0} + ${RENEW_CERT_TAG:-0} ))
    if (( requested_ops > 0 )); then
      check::apiserver_conn
    fi
  }
  # Install packages on the configured nodes: the container runtime and kube
  # packages on every node, haproxy on the workers, optionally the kubeadm
  # client with 10-year certificates, and finally helm on every node.
  # Globals read:    KUBE_CRI, KUBE_VERSION, HELM_VERSION, MASTER_NODES,
  #                  WORKER_NODES, MGMT_NODE, OFFLINE_TAG, ADD_TAG, CERT_YEAR_TAG,
  #                  OFFLINE_DIR, GITHUB_PROXY, COMMAND_OUTPUT
  # Globals written: KUBE_CRI_VERSION (cri-o only), host (loop variable)
  function install::package() {
    # For cri-o without an explicit version, derive it from KUBE_VERSION
    # (resolving "latest" through stable.txt) and drop the last dotted component.
    if [[ "${KUBE_CRI}" == "cri-o" && "${KUBE_CRI_VERSION}" == "latest" ]]; then
      KUBE_CRI_VERSION="${KUBE_VERSION}"
      if [[ "${KUBE_CRI_VERSION}" == "latest" ]]; then
        if ! command::exec "127.0.0.1" "wget https://storage.googleapis.com/kubernetes-release/release/stable.txt -q -O -"; then
          log::error "[install]" "get kubernetes stable version error. Please specify the version!"
          exit 1
        fi
        KUBE_CRI_VERSION="${COMMAND_OUTPUT#v}"
      fi
      KUBE_CRI_VERSION="${KUBE_CRI_VERSION%.*}"
    fi

    # Install the container runtime and the kube packages on every node.
    for host in $MASTER_NODES $WORKER_NODES; do
      # install cri
      log::info "[install]" "install ${KUBE_CRI} on $host."
      command::exec "${host}" "
export OFFLINE_TAG=${OFFLINE_TAG:-0}
$(declare -f script::install_"${KUBE_CRI}")
script::install_${KUBE_CRI} $KUBE_CRI_VERSION
"
      check::exit_code "$?" "install" "install ${KUBE_CRI} on $host"

      # install kube
      log::info "[install]" "install kube on $host"
      command::exec "${host}" "
export OFFLINE_TAG=${OFFLINE_TAG:-0}
$(declare -f script::install_kube)
script::install_kube $KUBE_VERSION
"
      check::exit_code "$?" "install" "install kube on $host"
    done

    # Work out the API server addresses handed to haproxy.
    local apiservers=$MASTER_NODES
    if [[ "$apiservers" == "127.0.0.1" ]]; then
      # Replace the loopback address with the source IP of the route to 8.8.8.8.
      command::exec "${MGMT_NODE}" "ip -o route get to 8.8.8.8 | sed -n 's/.*src \([0-9.]\+\).*/\1/p'"
      get::command_output "apiservers" "$?"
    fi

    # When adding nodes, take the API server list from the existing masters.
    if [[ "${ADD_TAG:-}" == "1" ]]; then
      command::exec "${MGMT_NODE}" "
kubectl get node --selector='node-role.kubernetes.io/master' -o jsonpath='{$.items[*].status.addresses[?(@.type==\"InternalIP\")].address}'
"
      get::command_output "apiservers" "$?"
    fi

    # Install haproxy on the worker nodes.
    for host in $WORKER_NODES; do
      # install haproxy
      log::info "[install]" "install haproxy on $host"
      command::exec "${host}" "
export OFFLINE_TAG=${OFFLINE_TAG:-0}
$(declare -f script::install_haproxy)
script::install_haproxy \"$apiservers\"
"
      check::exit_code "$?" "install" "install haproxy on $host"
    done

    # 10-year certificates: swap in the patched kubeadm client on every node.
    if [[ "${CERT_YEAR_TAG:-}" == "1" ]]; then
      local cert_version="${KUBE_VERSION}"
      if [[ "${cert_version}" == "latest" ]]; then
        if ! command::exec "127.0.0.1" "wget https://storage.googleapis.com/kubernetes-release/release/stable.txt -q -O -"; then
          log::error "[install]" "get kubernetes stable version error. Please specify the version!"
          exit 1
        fi
        cert_version="${COMMAND_OUTPUT#v}"
      fi

      log::info "[install]" "download kubeadm 10 years certs client"
      local kubeadm_bin="${OFFLINE_DIR}/bins/kubeadm-linux-amd64"
      MGMT_NODE="127.0.0.1" utils::download_file "${GITHUB_PROXY}https://github.com/lework/kubeadm-certs/releases/download/v${cert_version}/kubeadm-linux-amd64" "${kubeadm_bin}"

      for host in $MASTER_NODES $WORKER_NODES; do
        log::info "[install]" "scp kubeadm client to $host"
        command::scp "${host}" "${kubeadm_bin}" "/tmp/kubeadm-linux-amd64"
        check::exit_code "$?" "install" "scp kubeadm client to $host" "exit"

        # The stock kubeadm is kept once as /usr/bin/kubeadm_src.
        command::exec "${host}" "
set -e
if [[ -f /tmp/kubeadm-linux-amd64 ]]; then
[[ -f /usr/bin/kubeadm && ! -f /usr/bin/kubeadm_src ]] && mv -fv /usr/bin/kubeadm{,_src}
mv -fv /tmp/kubeadm-linux-amd64 /usr/bin/kubeadm
chmod +x /usr/bin/kubeadm
else
echo \"not found /tmp/kubeadm-linux-amd64\"
exit 1
fi
"
        check::exit_code "$?" "install" "$host: use kubeadm 10 years certs client"
      done
    fi

    # Install helm on every node.
    for host in $MASTER_NODES $WORKER_NODES; do
      log::info "[install]" "install helm on $host"
      command::exec "${host}" "
export OFFLINE_TAG=${OFFLINE_TAG:-0}
$(declare -f script::install_helm)
script::install_helm $HELM_VERSION
"
      check::exit_code "$?" "install" "install helm on $host"
    done
  }
  # Upgrade the kernel on every node, schedule a reboot, then exit the script.
  # Does nothing unless UPGRADE_KERNEL_TAG is "1".
  # Globals:   UPGRADE_KERNEL_TAG, MASTER_NODES, WORKER_NODES, OFFLINE_TAG,
  #            SCRIPT_PARAMETER (read); host (written, loop variable)
  function init::upgrade_kernel() {
    if [[ "${UPGRADE_KERNEL_TAG:-}" != "1" ]]; then
      return
    fi

    for host in $MASTER_NODES $WORKER_NODES; do
      log::info "[init]" "upgrade kernel: $host"
      command::exec "${host}" "
export OFFLINE_TAG=${OFFLINE_TAG:-0}
$(declare -f script::upgrade_kernel)
script::upgrade_kernel
"
      check::exit_code "$?" "init" "upgrade kernel $host" "exit"
    done

    # Reboots are scheduled only after the upgrade loop above has finished.
    for host in $MASTER_NODES $WORKER_NODES; do
      command::exec "${host}" "bash -c 'sleep 15 && reboot' &>/dev/null &"
      check::exit_code "$?" "init" "$host: Wait for 15s to restart"
    done

    # Print the command line to re-run, with " --upgrade-kernel" removed.
    log::info "[notice]" "Please execute the command again!"
    log::access "[command]" "bash $0 ${SCRIPT_PARAMETER// --upgrade-kernel/}"
    exit 0
  }
  # Renew certificates on every node that carries the given role label.
  # Arguments:
  #   $1 - node role used in the label selector node-role.kubernetes.io/<role>
  #        (default: master)
  # Globals:   MGMT_NODE (read); host (written, loop variable)
  # Per node: drain, renew the control-plane certificates (masters only),
  # regenerate the kubelet kubeconfig, restart kubelet, and uncordon the node
  # once kubelet answers on port 10250.
  function cert::renew_node() {
    local role="${1:-master}"
    local hosts=""
    local kubelet_config=""
    local svc_endpoint svc_name svc_port

    command::exec "${MGMT_NODE}" "
kubectl get node --selector='node-role.kubernetes.io/${role}' -o jsonpath='{range.items[*]}{.metadata.name } {end}'
"
    get::command_output "hosts" "$?"

    for host in ${hosts}; do
      log::info "[cert]" "drain $host"
      command::exec "${MGMT_NODE}" "kubectl drain $host --force --ignore-daemonsets --delete-local-data"
      check::exit_code "$?" "cert" "$host: drain"
      sleep 5

      if [[ "${role}" == "master" ]]; then
        command::exec "${host}" "cp -rf /etc/kubernetes /etc/kubernetes_\$(date +%Y-%m-%d)"
        check::exit_code "$?" "cert" "$host: backup kubernetes config"

        command::exec "${host}" "kubeadm certs renew all 2>/dev/null|| kubeadm alpha certs renew all"
        check::exit_code "$?" "cert" "$host: renew certs"

        # Send SIGHUP to each control-plane process, then retry an openssl
        # probe of its port until certificate validity dates can be printed.
        for svc_endpoint in etcd:2379 kube-apiserver:6443 kube-controller-manager:10257 kube-scheduler:10259; do
          svc_name="${svc_endpoint%%:*}"
          svc_port="${svc_endpoint##*:}"
          command::exec "${host}" "
$(declare -f utils::retry)
kill -s SIGHUP \$(pidof ${svc_name}) && \
utils::retry 10 \"echo -n | openssl s_client -connect localhost:${svc_port} 2>&1 | sed -ne '/-BEGIN CERTIFICATE-/,/-END CERTIFICATE-/p' | openssl x509 -text -noout | grep Not\"
"
          check::exit_code "$?" "cert" "$host: restart ${svc_name}"
        done
      fi

      log::info "[cert]" "get kubelet config"
      command::exec "${MGMT_NODE}" "
kubeadm kubeconfig user --org system:nodes --client-name system:node:${host} --config /etc/kubernetes/kubeadmcfg.yaml || kubeadm alpha kubeconfig user --org system:nodes --client-name system:node:${host} --config /etc/kubernetes/kubeadmcfg.yaml
"
      get::command_output "kubelet_config" "$?" "exit"

      # Without a generated kubeconfig there is nothing to install on this node.
      if [[ "$kubelet_config" == "" ]]; then
        continue
      fi

      log::info "[cert]" "copy kubelet config"
      # The server URL in the generated kubeconfig is rewritten to 127.0.0.1.
      command::exec "${host}" "
cp /etc/kubernetes/kubelet.conf /etc/kubernetes/kubelet.conf_bak
echo '$(printf "%s" "${kubelet_config}" | sed 's#https://.*:#https://127.0.0.1:#g')' > /etc/kubernetes/kubelet.conf
"
      check::exit_code "$?" "cert" "$host: copy kubelet config"

      command::exec "${host}" "rm -rfv /var/lib/kubelet/pki/*"
      check::exit_code "$?" "cert" "$host: delete kubelet pki files"

      command::exec "${host}" "
$(declare -f utils::retry)
systemctl restart kubelet && \
utils::retry 10 \"echo -n | openssl s_client -connect localhost:10250 2>&1 | sed -ne '/-BEGIN CERTIFICATE-/,/-END CERTIFICATE-/p' | openssl x509 -text -noout | grep Not\"
"
      local kubelet_rc="$?"
      check::exit_code "${kubelet_rc}" "cert" "$host: restart kubelet"

      # Only put the node back into service when kubelet came back up.
      if [[ "${kubelet_rc}" == "0" ]]; then
        sleep 5
        command::exec "${MGMT_NODE}" "kubectl uncordon ${host}"
        check::exit_code "$?" "cert" "uncordon ${host} node"
      fi
    done
  }
  # Renew the certificates of the whole cluster: masters first, then workers,
  # and finally print the node list and certificate expiration report.
  # Globals:   MGMT_NODE, COMMAND_OUTPUT (read)
  function cert::renew() {
    local node_role
    local status_script="
echo
kubectl get node
echo
kubeadm certs check-expiration 2>/dev/null || kubeadm alpha certs check-expiration
"

    log::info "[cert]" "renew cluster cert"
    for node_role in "master" "worker"; do
      cert::renew_node "${node_role}"
    done

    log::info "[cert]" "cluster status"
    # The report is printed only when the status command succeeded.
    command::exec "${MGMT_NODE}" "${status_script}" && printf "%s" "${COMMAND_OUTPUT}"
  }
  # Initialise every master and worker node: run script::init_node remotely,
  # append host entries to /etc/hosts and set the hostname. Masters also get
  # the audit policy file.
  # Globals read:    MGMT_NODE, MASTER_NODES, WORKER_NODES, KUBE_APISERVER,
  #                  HOSTNAME_PREFIX, OFFLINE_TAG, SKIP_SET_OS_REPO
  # Globals written: MGMT_NODE_IP (through get::command_output), host
  # Read from the caller's scope: node_hosts (extra /etc/hosts lines) and,
  # when set, master_index / worker_index (first hostname number, default 1).
  function init::node_config() {
    local master_index=${master_index:-1}
    local worker_index=${worker_index:-1}

    log::info "[init]" "Get $MGMT_NODE InternalIP."
    command::exec "${MGMT_NODE}" "
ip -4 route get 8.8.8.8 2>/dev/null | head -1 | awk '{print \$7}'
"
    get::command_output "MGMT_NODE_IP" "$?" "exit"

    # master
    for host in $MASTER_NODES; do
      log::info "[init]" "master: $host"
      command::exec "${host}" "
export OFFLINE_TAG=${OFFLINE_TAG:-0} KUBE_APISERVER=${KUBE_APISERVER} SKIP_SET_OS_REPO=${SKIP_SET_OS_REPO:-false}
$(declare -f script::init_node)
script::init_node
"
      check::exit_code "$?" "init" "init master $host" "exit"

      # Set hostname and name resolution; on masters the API server name
      # resolves to the management node IP.
      command::exec "${host}" "
printf \"\\n${MGMT_NODE_IP} $KUBE_APISERVER\\n$node_hosts\" >> /etc/hosts
hostnamectl set-hostname ${HOSTNAME_PREFIX}-master-node${master_index}
"
      check::exit_code "$?" "init" "$host set hostname and hostname resolution"

      # set audit-policy
      log::info "[init]" "$host: set audit-policy file."
      # The directory is addressed by its absolute path; `mkdir -p` is a
      # no-op when it already exists.
      command::exec "${host}" "
mkdir -p /etc/kubernetes
cat << EOF > /etc/kubernetes/audit-policy.yaml
# Log all requests at the Metadata level.
apiVersion: audit.k8s.io/v1
kind: Policy
rules:
- level: Metadata
EOF
"
      check::exit_code "$?" "init" "$host: set audit-policy file" "exit"
      master_index=$((master_index + 1))
    done

    # worker
    for host in $WORKER_NODES; do
      log::info "[init]" "worker: $host"
      command::exec "${host}" "
export OFFLINE_TAG=${OFFLINE_TAG:-0} KUBE_APISERVER=${KUBE_APISERVER} SKIP_SET_OS_REPO=${SKIP_SET_OS_REPO:-false}
$(declare -f script::init_node)
script::init_node
"
      check::exit_code "$?" "init" "init worker $host" "exit"

      # Set hostname and name resolution; on workers the API server name
      # resolves to 127.0.0.1.
      command::exec "${host}" "
printf \"\\n127.0.0.1 $KUBE_APISERVER\\n$node_hosts\" >> /etc/hosts
hostnamectl set-hostname ${HOSTNAME_PREFIX}-worker-node${worker_index}
"
      # Report the outcome like the master branch does (log only, no exit).
      check::exit_code "$?" "init" "$host set hostname and hostname resolution"
      worker_index=$((worker_index + 1))
    done
  }
  # Initialise the nodes of a new cluster: run the optional kernel upgrade,
  # build the "<address> <hostname>" list for all nodes and hand over to
  # init::node_config, which reads node_hosts from this scope.
  # Globals:   MASTER_NODES, WORKER_NODES, HOSTNAME_PREFIX (read); h (written)
  function init::node() {
    init::upgrade_kernel

    local node_hosts=""
    local i
    local hostname_role role_nodes

    # Hostnames are numbered from 1 separately for masters and workers.
    for hostname_role in master worker; do
      i=1
      if [[ "${hostname_role}" == "master" ]]; then
        role_nodes=$MASTER_NODES
      else
        role_nodes=$WORKER_NODES
      fi
      for h in $role_nodes; do
        node_hosts+="\n$h ${HOSTNAME_PREFIX}-${hostname_role}-node${i}"
        i=$((i + 1))
      done
    done

    init::node_config
  }
  # Initialise nodes that are being added to an existing cluster.
  # Globals read:    MASTER_NODES, WORKER_NODES (the nodes to add), HOSTNAME_PREFIX
  # Globals written: MGMT_NODE (set to the first address reported for the
  #                  existing masters), host (loop variable)
  # Exits with status 1 when a node to add is already part of the cluster.
  # master_index, worker_index and node_hosts are locals that
  # init::node_config reads from this scope.
  function init::add_node() {
    init::upgrade_kernel

    local master_index=0
    local worker_index=0
    local node_hosts=""
    local add_node_hosts=""

    command::exec "${MGMT_NODE}" "
kubectl get node --selector='node-role.kubernetes.io/master' -o jsonpath='{range.items[*]}{.status.addresses[?(@.type==\"InternalIP\")].address } {end}' | awk '{print \$1}'
"
    get::command_output "MGMT_NODE" "$?" "exit"

    # Get the "<InternalIP> <name>" pairs of the existing cluster nodes.
    command::exec "${MGMT_NODE}" "
kubectl get node -o jsonpath='{range.items[*]}{.status.addresses[?(@.type==\"InternalIP\")].address} {.metadata.name }\\n{end}'
"
    get::command_output "node_hosts" "$?" "exit"

    for host in $MASTER_NODES $WORKER_NODES; do
      # Compare against whole fields: a substring test would reject 10.0.0.1
      # just because 10.0.0.11 is already a cluster member.
      if printf '%b\n' "${node_hosts}" \
        | awk -v target="${host}" '{ for (i = 1; i <= NF; i++) if ($i == target) found = 1 } END { exit !found }'; then
        log::error "[init]" "The host $host is already in the cluster!"
        exit 1
      fi
    done

    if [[ "$MASTER_NODES" != "" ]]; then
      # Highest number already used in a master node name.
      command::exec "${MGMT_NODE}" "
kubectl get node --selector='node-role.kubernetes.io/master' -o jsonpath='{\$.items[*].metadata.name}' |grep -Eo 'node[0-9]*'|grep -Eo '[0-9]*'|awk -F ' ' 'BEGIN {max = 0} {if (\$0+0 > max+0) max=\$0} END {print max}'
"
      get::command_output "master_index" "$?" "exit"
      master_index=$(( master_index + 1 ))
      local i=$master_index
      for host in $MASTER_NODES; do
        add_node_hosts="${add_node_hosts}\n${host:-} ${HOSTNAME_PREFIX}-master-node${i}"
        i=$((i + 1))
      done
    fi

    if [[ "$WORKER_NODES" != "" ]]; then
      # Highest number already used in a worker node name.
      command::exec "${MGMT_NODE}" "
kubectl get node --selector='node-role.kubernetes.io/worker' -o jsonpath='{\$.items[*].metadata.name}'| grep -Eo 'node[0-9]*'|grep -Eo '[0-9]*'|awk 'BEGIN {max = 0} {if (\$0+0 > max+0) max=\$0} END {print max}' || echo 0
"
      get::command_output "worker_index" "$?" "exit"
      worker_index=$(( worker_index + 1 ))
      local i=$worker_index
      for host in $WORKER_NODES; do
        add_node_hosts="${add_node_hosts}\n${host:-} ${HOSTNAME_PREFIX}-worker-node${i}"
        i=$((i + 1))
      done
    fi

    # Add name resolution for the new nodes on every existing cluster node.
    for host in $(echo -ne "$node_hosts" | awk '{print $1}'); do
      command::exec "${host}" "
printf \"$add_node_hosts\" >> /etc/hosts
"
      check::exit_code "$?" "init" "$host add new node hostname resolution"
    done

    node_hosts="${node_hosts}\n${add_node_hosts}"
    init::node_config
  }
  1444. # 集群初始化
  1445. function kubeadm::init() {
  1446. log::info "[kubeadm init]" "kubeadm init on ${MGMT_NODE}"
  1447. log::info "[kubeadm init]" "${MGMT_NODE}: set kubeadmcfg.yaml"
  1448. command::exec "${MGMT_NODE}" "
  1449. PAUSE_VERSION=$(kubeadm config images list 2>/dev/null | awk -F: '/pause/ {print $2}')
  1450. cat << EOF > /etc/kubernetes/kubeadmcfg.yaml
  1451. ---
  1452. apiVersion: kubeadm.k8s.io/v1beta2
  1453. kind: InitConfiguration
  1454. ${kubelet_nodeRegistration}
  1455. ---
  1456. apiVersion: kubeproxy.config.k8s.io/v1alpha1
  1457. kind: KubeProxyConfiguration
  1458. mode: ipvs
  1459. ipvs:
  1460. minSyncPeriod: 5s
  1461. syncPeriod: 5s
  1462. # ipvs 负载策略
  1463. scheduler: 'wrr'
  1464. ---
  1465. apiVersion: kubelet.config.k8s.io/v1beta1
  1466. kind: KubeletConfiguration
  1467. maxPods: 200
  1468. cgroupDriver: systemd
  1469. runtimeRequestTimeout: 5m
  1470. # 此配置保证了 kubelet 能在 swap 开启的情况下启动
  1471. failSwapOn: false
  1472. nodeStatusUpdateFrequency: 5s
  1473. rotateCertificates: true
  1474. imageGCLowThresholdPercent: 70
  1475. imageGCHighThresholdPercent: 80
  1476. # 软驱逐阀值
  1477. evictionSoft:
  1478. imagefs.available: 15%
  1479. memory.available: 512Mi
  1480. nodefs.available: 15%
  1481. nodefs.inodesFree: 10%
  1482. # 达到软阈值之后,持续时间超过多久才进行驱逐
  1483. evictionSoftGracePeriod:
  1484. imagefs.available: 3m
  1485. memory.available: 1m
  1486. nodefs.available: 3m
  1487. nodefs.inodesFree: 1m
  1488. # 硬驱逐阀值
  1489. evictionHard:
  1490. imagefs.available: 10%
  1491. memory.available: 256Mi
  1492. nodefs.available: 10%
  1493. nodefs.inodesFree: 5%
  1494. evictionMaxPodGracePeriod: 30
  1495. # 节点资源预留
  1496. kubeReserved:
  1497. cpu: 200m\$(if [[ \$(cat /proc/meminfo | awk '/MemTotal/ {print \$2}') -gt 3670016 ]]; then echo -e '\n memory: 256Mi';fi)
  1498. ephemeral-storage: 1Gi
  1499. systemReserved:
  1500. cpu: 300m\$(if [[ \$(cat /proc/meminfo | awk '/MemTotal/ {print \$2}') -gt 3670016 ]]; then echo -e '\n memory: 512Mi';fi)
  1501. ephemeral-storage: 1Gi
  1502. kubeReservedCgroup: /kube.slice
  1503. systemReservedCgroup: /system.slice
  1504. enforceNodeAllocatable:
  1505. - pods
  1506. ---
  1507. apiVersion: kubeadm.k8s.io/v1beta2
  1508. kind: ClusterConfiguration
  1509. kubernetesVersion: $KUBE_VERSION
  1510. controlPlaneEndpoint: $KUBE_APISERVER:6443
  1511. networking:
  1512. dnsDomain: $KUBE_DNSDOMAIN
  1513. podSubnet: $KUBE_POD_SUBNET
  1514. serviceSubnet: $KUBE_SERVICE_SUBNET
  1515. imageRepository: $KUBE_IMAGE_REPO
  1516. apiServer:
  1517. certSANs:
  1518. - 127.0.0.1
  1519. - $KUBE_APISERVER
  1520. $(for h in $MASTER_NODES;do echo " - $h";done)
  1521. extraArgs:
  1522. event-ttl: '720h'
  1523. service-node-port-range: '30000-50000'
  1524. # 审计日志相关配置
  1525. audit-log-maxage: '20'
  1526. audit-log-maxbackup: '10'
  1527. audit-log-maxsize: '100'
  1528. audit-log-path: /var/log/kube-audit/audit.log
  1529. audit-policy-file: /etc/kubernetes/audit-policy.yaml
  1530. extraVolumes:
  1531. - name: audit-config
  1532. hostPath: /etc/kubernetes/audit-policy.yaml
  1533. mountPath: /etc/kubernetes/audit-policy.yaml
  1534. readOnly: true
  1535. pathType: File
  1536. - name: audit-log
  1537. hostPath: /var/log/kube-audit
  1538. mountPath: /var/log/kube-audit
  1539. pathType: DirectoryOrCreate
  1540. - name: localtime
  1541. hostPath: /etc/localtime
  1542. mountPath: /etc/localtime
  1543. readOnly: true
  1544. pathType: File
  1545. controllerManager:
  1546. extraArgs:
  1547. bind-address: 0.0.0.0
  1548. node-cidr-mask-size: '24'
  1549. deployment-controller-sync-period: '10s'
  1550. node-monitor-grace-period: '20s'
  1551. pod-eviction-timeout: '2m'
  1552. terminated-pod-gc-threshold: '30'
  1553. experimental-cluster-signing-duration: 87600h
  1554. feature-gates: RotateKubeletServerCertificate=true
  1555. extraVolumes:
  1556. - hostPath: /etc/localtime
  1557. mountPath: /etc/localtime
  1558. name: localtime
  1559. readOnly: true
  1560. pathType: File
  1561. scheduler:
  1562. extraArgs:
  1563. bind-address: 0.0.0.0
  1564. extraVolumes:
  1565. - hostPath: /etc/localtime
  1566. mountPath: /etc/localtime
  1567. name: localtime
  1568. readOnly: true
  1569. pathType: File
  1570. $(if [[ "${KUBE_VERSION}" == "1.21.1" ]]; then
  1571. echo "dns:
  1572. type: CoreDNS
  1573. imageRepository: docker.io
  1574. imageTag: 1.8.0"
  1575. fi)
  1576. EOF
  1577. "
  1578. check::exit_code "$?" "kubeadm init" "${MGMT_NODE}: set kubeadmcfg.yaml" "exit"
  1579. log::info "[kubeadm init]" "${MGMT_NODE}: kubeadm init start."
  1580. command::exec "${MGMT_NODE}" "kubeadm init --config=/etc/kubernetes/kubeadmcfg.yaml --upload-certs"
  1581. check::exit_code "$?" "kubeadm init" "${MGMT_NODE}: kubeadm init" "exit"
  1582. sleep 3
  1583. log::info "[kubeadm init]" "${MGMT_NODE}: set kube config."
  1584. command::exec "${MGMT_NODE}" "
  1585. mkdir -p \$HOME/.kube
  1586. sudo cp -f /etc/kubernetes/admin.conf \$HOME/.kube/config
  1587. "
  1588. check::exit_code "$?" "kubeadm init" "${MGMT_NODE}: set kube config" "exit"
  1589. if [[ "$(echo "$MASTER_NODES" | wc -w)" == "1" ]]; then
  1590. log::info "[kubeadm init]" "${MGMT_NODE}: delete master taint"
  1591. command::exec "${MGMT_NODE}" "kubectl taint nodes --all node-role.kubernetes.io/master-"
  1592. check::exit_code "$?" "kubeadm init" "${MGMT_NODE}: delete master taint"
  1593. fi
  1594. command::exec "${MGMT_NODE}" "
  1595. kubectl create clusterrolebinding node-client-auto-approve-csr --clusterrole=system:certificates.k8s.io:certificatesigningrequests:nodeclient --user=kubelet-bootstrap
  1596. kubectl create clusterrolebinding node-client-auto-renew-crt --clusterrole=system:certificates.k8s.io:certificatesigningrequests:selfnodeclient --group=system:nodes
  1597. kubectl create clusterrolebinding node-server-auto-renew-crt --clusterrole=system:certificates.k8s.io:certificatesigningrequests:selfnodeserver --group=system:nodes
  1598. "
  1599. check::exit_code "$?" "kubeadm init" "Auto-Approve kubelet cert csr" "exit"
  1600. }
  1601. # 加入集群
  1602. function kubeadm::join() {
  1603. log::info "[kubeadm join]" "master: get join token and cert info"
  1604. command::exec "${MGMT_NODE}" "
  1605. openssl x509 -pubkey -in /etc/kubernetes/pki/ca.crt | openssl rsa -pubin -outform der 2>/dev/null | openssl dgst -sha256 -hex | sed 's/^.* //'
  1606. "
  1607. get::command_output "CACRT_HASH" "$?" "exit"
  1608. command::exec "${MGMT_NODE}" "
  1609. kubeadm init phase upload-certs --upload-certs --config /etc/kubernetes/kubeadmcfg.yaml 2>> /dev/null | tail -1
  1610. "
  1611. get::command_output "INTI_CERTKEY" "$?" "exit"
  1612. command::exec "${MGMT_NODE}" "
  1613. kubeadm token create
  1614. "
  1615. get::command_output "INIT_TOKEN" "$?" "exit"
  1616. command::exec "${MGMT_NODE}" "
  1617. kubeadm config images list 2>/dev/null | awk -F: '/pause/ {print \$2}'
  1618. "
  1619. get::command_output "PAUSE_VERSION" "$?"
  1620. for host in $MASTER_NODES
  1621. do
  1622. [[ "${MGMT_NODE}" == "$host" ]] && continue
  1623. log::info "[kubeadm join]" "master $host join cluster."
  1624. command::exec "${host}" "
  1625. cat << EOF > /etc/kubernetes/kubeadmcfg.yaml
  1626. ---
  1627. apiVersion: kubeadm.k8s.io/v1beta2
  1628. kind: JoinConfiguration
  1629. discovery:
  1630. bootstrapToken:
  1631. apiServerEndpoint: $KUBE_APISERVER:6443
  1632. caCertHashes:
  1633. - sha256:${CACRT_HASH:-}
  1634. token: ${INIT_TOKEN}
  1635. timeout: 5m0s
  1636. controlPlane:
  1637. certificateKey: ${INTI_CERTKEY:-}
  1638. ${kubelet_nodeRegistration}
  1639. EOF
  1640. kubeadm join --config /etc/kubernetes/kubeadmcfg.yaml
  1641. "
  1642. check::exit_code "$?" "kubeadm join" "master $host join cluster"
  1643. log::info "[kubeadm join]" "$host: set kube config."
  1644. command::exec "${host}" "
  1645. mkdir -p \$HOME/.kube
  1646. sudo cp -f /etc/kubernetes/admin.conf \$HOME/.kube/config
  1647. "
  1648. check::exit_code "$?" "kubeadm join" "$host: set kube config" "exit"
  1649. command::exec "${host}" "
  1650. sed -i 's#.*$KUBE_APISERVER#127.0.0.1 $KUBE_APISERVER#g' /etc/hosts
  1651. "
  1652. done
  1653. for host in $WORKER_NODES
  1654. do
  1655. log::info "[kubeadm join]" "worker $host join cluster."
  1656. command::exec "${host}" "
  1657. mkdir -p /etc/kubernetes/manifests
  1658. cat << EOF > /etc/kubernetes/kubeadmcfg.yaml
  1659. ---
  1660. apiVersion: kubeadm.k8s.io/v1beta2
  1661. kind: JoinConfiguration
  1662. discovery:
  1663. bootstrapToken:
  1664. apiServerEndpoint: $KUBE_APISERVER:6443
  1665. caCertHashes:
  1666. - sha256:${CACRT_HASH:-}
  1667. token: ${INIT_TOKEN}
  1668. timeout: 5m0s
  1669. ${kubelet_nodeRegistration}
  1670. EOF
  1671. kubeadm join --config /etc/kubernetes/kubeadmcfg.yaml
  1672. "
  1673. check::exit_code "$?" "kubeadm join" "worker $host join cluster"
  1674. log::info "[kubeadm join]" "set $host worker node role."
  1675. command::exec "${MGMT_NODE}" "
  1676. kubectl get node --selector='!node-role.kubernetes.io/master' | grep '<none>' | awk '{print \"kubectl label node \" \$1 \" node-role.kubernetes.io/worker= --overwrite\" }' | bash
  1677. "
  1678. check::exit_code "$?" "kubeadm join" "set $host worker node role"
  1679. done
  1680. }
  1681. # 等待资源完成
  1682. function kube::wait() {
  1683. local app=$1
  1684. local namespace=$2
  1685. local resource=$3
  1686. local selector=${4:-}
  1687. sleep 3
  1688. log::info "[waiting]" "waiting $app"
  1689. command::exec "${MGMT_NODE}" "
  1690. $(declare -f utils::retry)
  1691. utils::retry 6 kubectl wait --namespace ${namespace} \
  1692. --for=condition=ready ${resource} \
  1693. --selector=$selector \
  1694. --timeout=60s
  1695. "
  1696. local status="$?"
  1697. check::exit_code "$status" "waiting" "$app ${resource} ready"
  1698. return "$status"
  1699. }
  1700. # 应用manifest
  1701. function kube::apply() {
  1702. local file=$1
  1703. log::info "[apply]" "$file"
  1704. command::exec "${MGMT_NODE}" "
  1705. $(declare -f utils::retry)
  1706. if [ -f \"$file\" ]; then
  1707. utils::retry 6 kubectl apply --wait=true --timeout=10s -f \"$file\"
  1708. else
  1709. utils::retry 6 \"cat <<EOF | kubectl apply --wait=true --timeout=10s -f -
  1710. \$(printf \"%s\" \"${2:-}\")
  1711. EOF
  1712. \"
  1713. fi
  1714. "
  1715. local status="$?"
  1716. check::exit_code "$status" "apply" "add $file" "exit"
  1717. return "$status"
  1718. }
  1719. # 集群状态
  1720. function kube::status() {
  1721. sleep 5
  1722. log::info "[cluster]" "cluster status"
  1723. command::exec "${MGMT_NODE}" "
  1724. echo
  1725. kubectl get node -o wide
  1726. echo
  1727. kubectl get pods -A
  1728. " && printf "%s" "${COMMAND_OUTPUT}"
  1729. }
  1730. # 添加或删除haproxy的后端server
  1731. function config::haproxy_backend() {
  1732. local action=${1:-add}
  1733. local action_cmd=""
  1734. local master_nodes
  1735. if [[ "$MASTER_NODES" == "" || "$MASTER_NODES" == "127.0.0.1" ]]; then
  1736. return
  1737. fi
  1738. command::exec "${MGMT_NODE}" "
  1739. kubectl get node --selector='node-role.kubernetes.io/master' -o jsonpath='{\$.items[*].status.addresses[?(@.type==\"InternalIP\")].address}'
  1740. "
  1741. get::command_output "master_nodes" "$?" "exit"
  1742. for m in $MASTER_NODES
  1743. do
  1744. if [[ "${action}" == "add" ]]; then
  1745. num=$(echo "${m}"| awk -F'.' '{print $4}')
  1746. action_cmd="${action_cmd}\necho \" server apiserver${num} ${m}:6443 check\" >> /etc/haproxy/haproxy.cfg"
  1747. else
  1748. [[ "${master_nodes}" == *"${m}"* ]] || return
  1749. action_cmd="${action_cmd}\n sed -i -e \"/${m}/d\" /etc/haproxy/haproxy.cfg"
  1750. fi
  1751. done
  1752. command::exec "${MGMT_NODE}" "
  1753. kubectl get node --selector='!node-role.kubernetes.io/master' -o jsonpath='{\$.items[*].status.addresses[?(@.type==\"InternalIP\")].address}'
  1754. "
  1755. get::command_output "worker_nodes" "$?"
  1756. for host in ${worker_nodes:-}
  1757. do
  1758. log::info "[config]" "worker ${host}: ${action} apiserver from haproxy"
  1759. command::exec "${host}" "
  1760. $(echo -ne "${action_cmd}")
  1761. haproxy -c -f /etc/haproxy/haproxy.cfg && systemctl reload haproxy
  1762. "
  1763. check::exit_code "$?" "config" "worker ${host}: ${action} apiserver(${m}) from haproxy"
  1764. done
  1765. }
  1766. # 更新 etcd 备份副本
  1767. function config::etcd_snapshot() {
  1768. command::exec "${MGMT_NODE}" "
  1769. count=\$(kubectl get node --selector='node-role.kubernetes.io/master' --no-headers | wc -l)
  1770. kubectl -n kube-system patch cronjobs etcd-snapshot --patch \"
  1771. spec:
  1772. jobTemplate:
  1773. spec:
  1774. completions: \${count:-1}
  1775. parallelism: \${count:-1}
  1776. \"
  1777. "
  1778. check::exit_code "$?" "config" "etcd-snapshot completions options"
  1779. }
  1780. # 获取命令的返回值
  1781. function get::command_output() {
  1782. local app="$1"
  1783. local status="$2"
  1784. local is_exit="${3:-}"
  1785. if [[ "$status" == "0" && "${COMMAND_OUTPUT}" != "" ]]; then
  1786. log::info "[command]" "get $app value succeeded."
  1787. eval "$app=\"${COMMAND_OUTPUT}\""
  1788. else
  1789. log::error "[command]" "get $app value failed."
  1790. [[ "$is_exit" == "exit" ]] && exit "$status"
  1791. fi
  1792. return "$status"
  1793. }
  1794. # 获取ingress连接地址
  1795. function get::ingress_conn() {
  1796. local port="${1:-80}"
  1797. local ingress_name="${2:-ingress-${KUBE_INGRESS}-controller}"
  1798. command::exec "${MGMT_NODE}" "
  1799. kubectl get node -o jsonpath='{range .items[*]}{ .status.addresses[?(@.type==\"InternalIP\")].address} {.status.conditions[?(@.status == \"True\")].status}{\"\\n\"}{end}' | awk '{if(\$2==\"True\")a=\$1}END{print a}'
  1800. "
  1801. get::command_output "node_ip" "$?"
  1802. command::exec "${MGMT_NODE}" "
  1803. kubectl get svc --all-namespaces -o go-template=\"{{range .items}}{{if eq .metadata.name \\\"${ingress_name}\\\"}}{{range.spec.ports}}{{if eq .port ${port}}}{{.nodePort}}{{end}}{{end}}{{end}}{{end}}\"
  1804. "
  1805. get::command_output "node_port" "$?"
  1806. INGRESS_CONN="${node_ip:-nodeIP}:${node_port:-nodePort}"
  1807. }
  1808. ######################################################################################################
  1809. # 主调用逻辑
  1810. ######################################################################################################
  1811. # 添加network组件
  1812. function add::network() {
  1813. if [[ "$KUBE_NETWORK" == "flannel" ]]; then
  1814. log::info "[network]" "add flannel"
  1815. local flannel_file="${OFFLINE_DIR}/manifests/kube-flannel.yml"
  1816. utils::download_file "https://cdn.jsdelivr.net/gh/coreos/flannel@v${FLANNEL_VERSION}/Documentation/kube-flannel.yml" "${flannel_file}"
  1817. command::exec "${MGMT_NODE}" "
  1818. sed -i -e 's#10.244.0.0/16#${KUBE_POD_SUBNET}#g' \
  1819. -e 's#quay.io/coreos#${KUBE_IMAGE_REPO}#g' \
  1820. -e 's#\"Type\": \"vxlan\"#\"Type\": \"${KUBE_FLANNEL_TYPE}\"#g' \"${flannel_file}\"
  1821. if [[ \"${KUBE_FLANNEL_TYPE}\" == \"vxlan\" ]]; then
  1822. sed -i 's#\"Type\": \"vxlan\"#\"Type\": \"vxlan\", \"DirectRouting\": true#g' \"${flannel_file}\"
  1823. fi
  1824. "
  1825. check::exit_code "$?" "flannel" "change flannel pod subnet"
  1826. kube::apply "${flannel_file}"
  1827. kube::wait "flannel" "kube-system" "pods" "app=flannel"
  1828. elif [[ "$KUBE_NETWORK" == "calico" ]]; then
  1829. log::info "[network]" "add calico"
  1830. utils::download_file "https://projectcalico.docs.tigera.io/archive/v${CALICO_VERSION%.*}/manifests/calico.yaml" "${OFFLINE_DIR}/manifests/calico.yaml"
  1831. utils::download_file "https://projectcalico.docs.tigera.io/archive/v${CALICO_VERSION%.*}/manifests/calicoctl.yaml" "${OFFLINE_DIR}/manifests/calicoctl.yaml"
  1832. command::exec "${MGMT_NODE}" "
  1833. sed -i \"s#:v.*#:v${CALICO_VERSION}#g\" \"${OFFLINE_DIR}/manifests/calico.yaml\"
  1834. sed -i 's#value: \"Always\"#value: \"CrossSubnet\"#g' \"${OFFLINE_DIR}/manifests/calico.yaml\"
  1835. sed -i \"s#:v.*#:v${CALICO_VERSION}#g\" \"${OFFLINE_DIR}/manifests/calicoctl.yaml\"
  1836. "
  1837. check::exit_code "$?" "network" "change calico version to ${CALICO_VERSION}"
  1838. kube::apply "${OFFLINE_DIR}/manifests/calico.yaml"
  1839. kube::apply "${OFFLINE_DIR}/manifests/calicoctl.yaml"
  1840. kube::wait "calico-kube-controllers" "kube-system" "pods" "k8s-app=calico-kube-controllers"
  1841. kube::wait "calico-node" "kube-system" "pods" "k8s-app=calico-node"
  1842. elif [[ "$KUBE_NETWORK" == "cilium" ]]; then
  1843. log::info "[network]" "add cilium"
  1844. local cilium_file="${OFFLINE_DIR}/manifests/cilium.yml"
  1845. local cilium_hubble_file="${OFFLINE_DIR}/manifests/cilium_hubble.yml"
  1846. utils::download_file "https://cdn.jsdelivr.net/gh/cilium/cilium@${CILIUM_VERSION}/install/kubernetes/quick-install.yaml" "${cilium_file}"
  1847. utils::download_file "https://cdn.jsdelivr.net/gh/cilium/cilium@${CILIUM_VERSION}/install/kubernetes/quick-hubble-install.yaml" "${cilium_hubble_file}"
  1848. local all_node=""
  1849. if [[ "${MASTER_NODES}" == "" && "${WORKER_NODES}" == "" ]]; then
  1850. command::exec "${MGMT_NODE}" "
  1851. kubectl get node -o jsonpath='{range.items[*]}{.status.addresses[?(@.type==\"InternalIP\")].address} {end}'
  1852. "
  1853. get::command_output "all_node" "$?"
  1854. else
  1855. all_node="${MASTER_NODES} ${WORKER_NODES}"
  1856. fi
  1857. for host in $all_node
  1858. do
  1859. command::exec "${host}" "mount bpffs -t bpf /sys/fs/bpf"
  1860. check::exit_code "$?" "network" "${host}: mount bpf filesystem"
  1861. done
  1862. command::exec "${MGMT_NODE}" "
  1863. sed -i \"s#10.0.0.0/8#${KUBE_POD_SUBNET}#g\" \"${cilium_file}\"
  1864. "
  1865. kube::apply "${cilium_file}"
  1866. kube::wait "cilium-node" "kube-system" "pods" "k8s-app=cilium"
  1867. kube::wait "cilium-operator" "kube-system" "pods" "name=cilium-operator"
  1868. kube::apply "${cilium_hubble_file}"
  1869. kube::wait "hubble-relay" "kube-system" "pods" "k8s-app=hubble-relay"
  1870. log::info "[monitor]" "add hubble-ui ingress"
  1871. kube::apply "hubble-ui ingress" "
  1872. ---
  1873. apiVersion: networking.k8s.io/v1
  1874. kind: Ingress
  1875. metadata:
  1876. name: hubble-ui
  1877. namespace: kube-system
  1878. annotations:
  1879. kubernetes.io/ingress.class: ${KUBE_INGRESS}
  1880. spec:
  1881. rules:
  1882. - host: hubble-ui.cluster.local
  1883. http:
  1884. paths:
  1885. - path: /
  1886. pathType: Prefix
  1887. backend:
  1888. service:
  1889. name: hubble-ui
  1890. port:
  1891. number: 80
  1892. "
  1893. # shellcheck disable=SC2181
  1894. if [[ "$?" == "0" ]]; then
  1895. get::ingress_conn
  1896. log::access "[ingress]" "curl -H 'Host:hubble-ui.cluster.local' http://${INGRESS_CONN}"
  1897. fi
  1898. else
  1899. log::warning "[network]" "No $KUBE_NETWORK config."
  1900. fi
  1901. }
  1902. # 添加ingress组件
  1903. function add::ingress() {
  1904. # 安装 ingress-nginx
  1905. log::info "[ingress]" "add ingress-nginx"
  1906. command::exec "${MGMT_NODE}" "
  1907. $(declare -f utils::retry)
  1908. helm repo add ingress-nginx https://kubernetes.github.io/ingress-nginx
  1909. utils::retry 6 helm install ingress-nginx ingress-nginx/ingress-nginx \
  1910. --namespace ingress-nginx --create-namespace \
  1911. --version ${INGRESS_NGINX} \
  1912. --set controller.admissionWebhooks.patch.image.registry=registry.hub.docker.com \
  1913. --set controller.admissionWebhooks.patch.image.image=k8sgcrioingressnginx/kube-webhook-certgen \
  1914. --set controller.admissionWebhooks.patch.image.digest= \
  1915. --set controller.admissionWebhooks.enabled=true \
  1916. --set controller.kind=DaemonSet \
  1917. --set controller.replicaCount=1 \
  1918. --set controller.minAvailable=1 \
  1919. --set controller.image.registry=registry.hub.docker.com \
  1920. --set controller.image.image=k8sgcrioingressnginx/controller \
  1921. --set controller.image.digest= \
  1922. --set controller.ingressClassResource.name=nginx \
  1923. --set controller.ingressClassResource.enable=true \
  1924. --set controller.ingressClassResource.default=false \
  1925. --set controller.service.enabled=true \
  1926. --set controller.service.type=NodePort \
  1927. --set controller.service.enableHttp=true \
  1928. --set controller.service.enableHttps=true \
  1929. --set controller.service.nodePorts.http=30080 \
  1930. --set controller.service.nodePorts.https=30443 \
  1931. --set defaultBackend.image.registry=registry.hub.docker.com \
  1932. --set defaultBackend.image.image=gcmirrors/defaultbackend-amd64 \
  1933. --set defaultBackend.enabled=true \
  1934. --set defaultBackend.name=defaultbackend \
  1935. --set defaultBackend.replicaCount=1 \
  1936. --set defaultBackend.minAvailable=1 \
  1937. --set rbac.create=true \
  1938. --set serviceAccount.create=true \
  1939. --set podSecurityPolicy.enabled=true
  1940. kubectl get pod -n ingress-nginx -o wide
  1941. kubectl get svc -n ingress-nginx -o wide
  1942. "
  1943. # 安装 nginx
  1944. log::info "[nginx]" "add nginx"
  1945. command::exec "${MGMT_NODE}" "
  1946. sudo yum -y install nginx
  1947. nginx -v
  1948. sudo systemctl enable nginx
  1949. sudo service nginx start
  1950. cat << EOF > /etc/nginx/conf.d/k8s.ingress.conf
  1951. upstream k8s-ingress {
  1952. $(for h in $MASTER_NODES $WORKER_NODES;do echo " server $h:30080 max_fails=1 fail_timeout=15s;";done)
  1953. keepalive 128;
  1954. }
  1955. server {
  1956. listen ${NGINX_HTTP_PORT};
  1957. location / {
  1958. proxy_http_version 1.1;
  1959. proxy_set_header Connection \"\";
  1960. proxy_next_upstream error;
  1961. proxy_set_header X-Real-IP \\\$remote_addr;
  1962. proxy_set_header X-Forwarded-For \\\$proxy_add_x_forwarded_for;
  1963. proxy_set_header Host \\\$http_host;
  1964. proxy_set_header X-Nginx-Proxy true;
  1965. proxy_pass http://k8s-ingress/;
  1966. }
  1967. }
  1968. EOF
  1969. sudo nginx -s reload
  1970. "
  1971. }
  1972. # 添加addon组件
  1973. function add::addon() {
  1974. # TODO add addon
  1975. log::warning "[TODO]" "add addon"
  1976. }
  1977. # 添加监控组件
  1978. function add::monitor() {
  1979. # TODO add monitor
  1980. log::warning "[TODO]" "add monitor"
  1981. }
  1982. # 添加log组件
  1983. function add::log() {
  1984. # TODO add log
  1985. log::warning "[TODO]" "add log"
  1986. }
  1987. # 添加存储
  1988. function add::storage() {
  1989. # TODO add storage
  1990. log::warning "[TODO]" "add storage"
  1991. }
  1992. # 添加用户界面
  1993. function add::ui() {
  1994. local path="/tmp"
  1995. # 安装 rancher
  1996. log::info "[rancher]" "add rancher"
  1997. command::exec "${MGMT_NODE}" "
  1998. $(declare -f utils::retry)
  1999. cd ${path}
  2000. helm repo add rancher-stable http://rancher-mirror.oss-cn-beijing.aliyuncs.com/server-charts/stable
  2001. utils::retry 6 helm pull rancher-stable/rancher --version ${RANCHER_VERSION} --untar
  2002. cat << EOF > rancher/templates/service.yaml
  2003. apiVersion: v1
  2004. kind: Service
  2005. metadata:
  2006. name: {{ template \"rancher.fullname\" . }}
  2007. labels:
  2008. {{ include \"rancher.labels\" . | indent 4 }}
  2009. spec:
  2010. type: NodePort
  2011. ports:
  2012. - port: 80
  2013. targetPort: 80
  2014. protocol: TCP
  2015. name: http
  2016. # 使用nodePort端口
  2017. nodePort: 31080
  2018. - port: 443
  2019. targetPort: 444
  2020. protocol: TCP
  2021. name: https-internal
  2022. # 使用nodePort端口
  2023. nodePort: 31443
  2024. selector:
  2025. app: {{ template \"rancher.fullname\" . }}
  2026. EOF
  2027. helm install rancher ./rancher \
  2028. --namespace cattle-system --create-namespace \
  2029. --set replicas=1 \
  2030. --set ingress.tls.source=secret \
  2031. --set ingress.enabled=false
  2032. "
  2033. log::info "[rancher]" "获取初始密码 kubectl get secret --namespace cattle-system bootstrap-secret -o go-template='{{.data.bootstrapPassword|base64decode}}{{ \"\n\" }}'"
  2034. log::info "[rancher]" "重置初始密码 kubectl -n cattle-system exec \$(kubectl -n cattle-system get pods -l app=rancher | grep '1/1' | head -1 | awk '{ print \$1 }') -- reset-password"
  2035. }
  2036. # 运维操作
  2037. function add::ops() {
  2038. local master_num
  2039. master_num=$(awk '{print NF}' <<< "${MASTER_NODES}")
  2040. log::info "[ops]" "add anti-affinity strategy to coredns"
  2041. command::exec "${MGMT_NODE}" """
  2042. kubectl -n kube-system patch deployment coredns --patch '{\"spec\": {\"template\": {\"spec\": {\"affinity\":{\"podAntiAffinity\":{\"preferredDuringSchedulingIgnoredDuringExecution\":[{\"weight\":100,\"podAffinityTerm\":{\"labelSelector\":{\"matchExpressions\":[{\"key\":\"k8s-app\",\"operator\":\"In\",\"values\":[\"kube-dns\"]}]},\"topologyKey\":\"kubernetes.io/hostname\"}}]}}}}}}' --record
  2043. """
  2044. check::exit_code "$?" "ops" "add anti-affinity strategy to coredns"
  2045. log::info "[ops]" "add etcd snapshot cronjob"
  2046. command::exec "${MGMT_NODE}" "
  2047. kubeadm config images list --config=/etc/kubernetes/kubeadmcfg.yaml 2>/dev/null | grep etcd:
  2048. "
  2049. get::command_output "etcd_image" "$?"
  2050. command::exec "${MGMT_NODE}" "
  2051. kubectl get node --selector='node-role.kubernetes.io/master' --no-headers | wc -l
  2052. "
  2053. get::command_output "master_num" "$?"
  2054. [[ "${master_num:-0}" == "0" ]] && master_num=1
  2055. kube::apply "etcd-snapshot" """
  2056. ---
  2057. apiVersion: batch/v1beta1
  2058. kind: CronJob
  2059. metadata:
  2060. name: etcd-snapshot
  2061. namespace: kube-system
  2062. spec:
  2063. schedule: '0 */6 * * *'
  2064. successfulJobsHistoryLimit: 3
  2065. suspend: false
  2066. concurrencyPolicy: Allow
  2067. failedJobsHistoryLimit: 3
  2068. jobTemplate:
  2069. spec:
  2070. backoffLimit: 6
  2071. parallelism: ${master_num}
  2072. completions: ${master_num}
  2073. template:
  2074. metadata:
  2075. labels:
  2076. app: etcd-snapshot
  2077. spec:
  2078. affinity:
  2079. podAntiAffinity:
  2080. requiredDuringSchedulingIgnoredDuringExecution:
  2081. - labelSelector:
  2082. matchExpressions:
  2083. - key: app
  2084. operator: In
  2085. values:
  2086. - etcd-snapshot
  2087. topologyKey: 'kubernetes.io/hostname'
  2088. containers:
  2089. - name: etcd-snapshot
  2090. image: ${etcd_image:-${KUBE_IMAGE_REPO}/etcd:3.4.13-0}
  2091. imagePullPolicy: IfNotPresent
  2092. args:
  2093. - -c
  2094. - etcdctl --endpoints=https://127.0.0.1:2379 --cacert=/etc/kubernetes/pki/etcd/ca.crt
  2095. --cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt --key=/etc/kubernetes/pki/etcd/healthcheck-client.key
  2096. snapshot save /backup/etcd-snapshot-\\\\\\\$(date +%Y-%m-%d_%H:%M:%S_%Z).db
  2097. && echo 'delete old backups' && { find /backup -type f -mtime +30 -exec rm -fv {} \\; || echo error; }
  2098. command:
  2099. - /usr/bin/bash
  2100. env:
  2101. - name: ETCDCTL_API
  2102. value: '3'
  2103. resources: {}
  2104. terminationMessagePath: /dev/termination-log
  2105. terminationMessagePolicy: File
  2106. volumeMounts:
  2107. - name: etcd-certs
  2108. mountPath: /etc/kubernetes/pki/etcd
  2109. readOnly: true
  2110. - name: backup
  2111. mountPath: /backup
  2112. - name: etc
  2113. mountPath: /etc
  2114. - name: bin
  2115. mountPath: /usr/bin
  2116. - name: lib64
  2117. mountPath: /lib64
  2118. dnsPolicy: ClusterFirst
  2119. hostNetwork: true
  2120. nodeSelector:
  2121. node-role.kubernetes.io/master: ''
  2122. tolerations:
  2123. - effect: NoSchedule
  2124. operator: Exists
  2125. restartPolicy: OnFailure
  2126. schedulerName: default-scheduler
  2127. securityContext: {}
  2128. terminationGracePeriodSeconds: 30
  2129. volumes:
  2130. - name: etcd-certs
  2131. hostPath:
  2132. path: /etc/kubernetes/pki/etcd
  2133. type: DirectoryOrCreate
  2134. - name: backup
  2135. hostPath:
  2136. path: /var/lib/etcd/backups
  2137. type: DirectoryOrCreate
  2138. - name: etc
  2139. hostPath:
  2140. path: /etc
  2141. - name: bin
  2142. hostPath:
  2143. path: /usr/bin
  2144. - name: lib64
  2145. hostPath:
  2146. path: /lib64
  2147. """
  2148. # shellcheck disable=SC2181
  2149. [[ "$?" == "0" ]] && log::access "[ops]" "etcd backup directory: /var/lib/etcd/backups"
  2150. command::exec "${MGMT_NODE}" "
  2151. jobname=\"etcd-snapshot-$(date +%s)\"
  2152. kubectl create job --from=cronjob/etcd-snapshot \${jobname} -n kube-system && \
  2153. kubectl wait --for=condition=complete job/\${jobname} -n kube-system
  2154. "
  2155. check::exit_code "$?" "ops" "trigger etcd backup"
  2156. }
  2157. # 重置节点
  2158. function reset::node() {
  2159. local host=$1
  2160. log::info "[reset]" "node $host"
  2161. command::exec "${host}" "
  2162. set +ex
  2163. cri_socket=\"\"
  2164. [ -S /var/run/crio/crio.sock ] && cri_socket=\"--cri-socket /var/run/crio/crio.sock\"
  2165. [ -S /run/containerd/containerd.sock ] && cri_socket=\"--cri-socket /run/containerd/containerd.sock\"
  2166. kubeadm reset -f \$cri_socket
  2167. [ -f \"\$(which kubelet)\" ] && { systemctl stop kubelet; find /var/lib/kubelet | xargs -n 1 findmnt -n -o TARGET -T | sort | uniq | xargs -r umount -v; yum remove -y kubeadm kubelet kubectl; }
  2168. [ -d /etc/kubernetes ] && rm -rf /etc/kubernetes/* /var/lib/kubelet/* /var/lib/etcd/* \$HOME/.kube /etc/cni/net.d/* /var/lib/dockershim/* /var/lib/cni/* /var/run/kubernetes/*
  2169. [ -f \"\$(which docker)\" ] && { docker rm -f -v \$(docker ps | grep kube | awk '{print \$1}'); systemctl stop docker; rm -rf \$HOME/.docker /etc/docker/* /var/lib/docker/*; yum remove -y docker; }
  2170. [ -f \"\$(which containerd)\" ] && { crictl rm \$(crictl ps -a -q); systemctl stop containerd; rm -rf /etc/containerd/* /var/lib/containerd/*; yum remove -y containerd.io; }
  2171. [ -f \"\$(which crio)\" ] && { crictl rm \$(crictl ps -a -q); systemctl stop crio; rm -rf /etc/crictl.yaml /etc/crio/* /var/run/crio/*; yum remove -y cri-o; }
  2172. [ -f \"\$(which runc)\" ] && { find /run/containers/ /var/lib/containers/ | xargs -n 1 findmnt -n -o TARGET -T | sort | uniq | xargs -r umount -v; rm -rf /var/lib/containers/* /var/run/containers/*; yum remove -y runc; }
  2173. [ -f \"\$(which haproxy)\" ] && { systemctl stop haproxy; rm -rf /etc/haproxy/*; yum remove -y haproxy; }
  2174. sed -i -e \"/$KUBE_APISERVER/d\" -e '/-worker-/d' -e '/-master-/d' /etc/hosts
  2175. sed -i '/## Kainstall managed start/,/## Kainstall managed end/d' /etc/security/limits.conf /etc/systemd/system.conf /etc/bashrc /etc/rc.local /etc/audit/rules.d/audit.rules
  2176. [ -d /var/lib/elasticsearch ] && rm -rf /var/lib/elasticsearch/*
  2177. [ -d /var/lib/longhorn ] && rm -rf /var/lib/longhorn/*
  2178. [ -d \"${OFFLINE_DIR:-/tmp/abc}\" ] && rm -rf \"${OFFLINE_DIR:-/tmp/abc}\"
  2179. for repo in kubernetes.repo docker-ce.repo devel_kubic_libcontainers_stable.repo elrepo.repo
  2180. do
  2181. [ -f /etc/yum.repos.d/\${repo} ] && rm -f /etc/yum.repos.d/\${repo}
  2182. done
  2183. ipvsadm --clear
  2184. iptables -F && iptables -t nat -F && iptables -t mangle -F && iptables -X
  2185. for int in kube-ipvs0 cni0 docker0 dummy0 flannel.1 cilium_host cilium_net cilium_vxlan lxc_health nodelocaldns
  2186. do
  2187. [ -d /sys/class/net/\${int} ] && ip link delete \${int}
  2188. done
  2189. modprobe -r ipip
  2190. echo done.
  2191. "
  2192. check::exit_code "$?" "reset" "$host: reset"
  2193. }
  2194. # 重置所有节点
  2195. function reset::cluster() {
  2196. local all_node=""
  2197. command::exec "${MGMT_NODE}" "
  2198. kubectl get node -o jsonpath='{range.items[*]}{.status.addresses[?(@.type==\"InternalIP\")].address} {end}'
  2199. "
  2200. get::command_output "all_node" "$?"
  2201. all_node=$(echo "${WORKER_NODES} ${MASTER_NODES} ${all_node}" | awk '{for (i=1;i<=NF;i++) if (!a[$i]++) printf("%s%s",$i,FS)}')
  2202. for host in $all_node
  2203. do
  2204. reset::node "$host"
  2205. done
  2206. }
  2207. # 节点加载离线包
  2208. function offline::load() {
  2209. local role="${1:-}"
  2210. local hosts=""
  2211. if [[ "${role}" == "master" ]]; then
  2212. hosts="${MASTER_NODES}"
  2213. elif [[ "${role}" == "worker" ]]; then
  2214. hosts="${WORKER_NODES}"
  2215. fi
  2216. for host in ${hosts}
  2217. do
  2218. log::info "[offline]" "${role} ${host}: load offline file"
  2219. command::exec "${host}" "[[ ! -d \"${OFFLINE_DIR}\" ]] && { mkdir -pv \"${OFFLINE_DIR}\"; chmod 777 \"${OFFLINE_DIR}\"; } ||:"
  2220. check::exit_code "$?" "offline" "$host: mkdir offline dir" "exit"
  2221. if [[ "${UPGRADE_KERNEL_TAG:-}" == "1" ]]; then
  2222. command::scp "${host}" "${TMP_DIR}/packages/kernel/*" "${OFFLINE_DIR}"
  2223. check::exit_code "$?" "offline" "scp kernel file to $host" "exit"
  2224. else
  2225. log::info "[offline]" "${role} ${host}: copy offline file"
  2226. command::scp "${host}" "${TMP_DIR}/packages/kubeadm/*" "${OFFLINE_DIR}"
  2227. check::exit_code "$?" "offline" "scp kube file to $host" "exit"
  2228. command::scp "${host}" "${TMP_DIR}/packages/all/*" "${OFFLINE_DIR}"
  2229. check::exit_code "$?" "offline" "scp all file to $host" "exit"
  2230. if [[ "${role}" == "worker" ]]; then
  2231. command::scp "${host}" "${TMP_DIR}/packages/worker/*" "${OFFLINE_DIR}"
  2232. check::exit_code "$?" "offline" "scp worker file to $host" "exit"
  2233. fi
  2234. command::scp "${host}" "${TMP_DIR}/images/${role}.tgz" "${OFFLINE_DIR}"
  2235. check::exit_code "$?" "offline" "scp ${role} images to $host" "exit"
  2236. command::scp "${host}" "${TMP_DIR}/images/all.tgz" "${OFFLINE_DIR}"
  2237. check::exit_code "$?" "offline" "scp all images to $host" "exit"
  2238. fi
  2239. log::info "[offline]" "${role} ${host}: install package"
  2240. command::exec "${host}" "yum localinstall -y --skip-broken ${OFFLINE_DIR}/*.rpm"
  2241. check::exit_code "$?" "offline" "${role} ${host}: install package" "exit"
  2242. if [[ "${UPGRADE_KERNEL_TAG:-}" != "1" ]]; then
  2243. command::exec "${host}" "
  2244. set -e
  2245. for target in firewalld python-firewall firewalld-filesystem iptables; do
  2246. systemctl stop \$target &>/dev/null || true
  2247. systemctl disable \$target &>/dev/null || true
  2248. done
  2249. systemctl start docker && \
  2250. cd ${OFFLINE_DIR} && \
  2251. gzip -d -c ${1}.tgz | docker load && gzip -d -c all.tgz | docker load
  2252. "
  2253. check::exit_code "$?" "offline" "$host: load images" "exit"
  2254. fi
  2255. command::exec "${host}" "rm -rf ${OFFLINE_DIR:-/tmp/abc}"
  2256. check::exit_code "$?" "offline" "$host: clean offline file"
  2257. done
  2258. command::scp "${MGMT_NODE}" "${TMP_DIR}/manifests" "${OFFLINE_DIR}"
  2259. check::exit_code "$?" "offline" "scp manifests file to ${MGMT_NODE}" "exit"
  2260. command::scp "${MGMT_NODE}" "${TMP_DIR}/bins" "${OFFLINE_DIR}"
  2261. check::exit_code "$?" "offline" "scp bins file to ${MGMT_NODE}" "exit"
  2262. }
  2263. # 集群节点加载离线包
  2264. function offline::cluster() {
  2265. [ ! -f "${OFFLINE_FILE}" ] && { log::error "[offline]" "not found ${OFFLINE_FILE}" ; exit 1; }
  2266. log::info "[offline]" "Unzip offline package on local."
  2267. tar zxf "${OFFLINE_FILE}" -C "${TMP_DIR}/"
  2268. check::exit_code "$?" "offline" "Unzip offline package"
  2269. offline::load "master"
  2270. offline::load "worker"
  2271. }
  2272. # 初始化集群
  2273. function init::cluster() {
  2274. MGMT_NODE=$(echo "${MASTER_NODES}" | awk '{print $1}')
  2275. # 加载离线包
  2276. [[ "${OFFLINE_TAG:-}" == "1" ]] && offline::cluster
  2277. # 1. 初始化节点
  2278. init::node
  2279. # 2. 安装包
  2280. install::package
  2281. # 3. 初始化kubeadm
  2282. kubeadm::init
  2283. # 4. 加入集群
  2284. kubeadm::join
  2285. # 5. 添加network
  2286. add::network
  2287. # 6. 安装addon
  2288. add::addon
  2289. # 7. 添加ingress
  2290. add::ingress
  2291. # 8. 添加storage
  2292. [[ "${STORAGE_TAG:-}" == "1" ]] && add::storage
  2293. # 9. 添加web ui
  2294. add::ui
  2295. # 10. 添加monitor
  2296. [[ "${MONITOR_TAG:-}" == "1" ]] && add::monitor
  2297. # 11. 添加log
  2298. [[ "${LOG_TAG:-}" == "1" ]] && add::log
  2299. # 12. 运维操作
  2300. add::ops
  2301. # 13. 查看集群状态
  2302. kube::status
  2303. }
  2304. # 添加节点
  2305. function add::node() {
  2306. # 加载离线包
  2307. [[ "${OFFLINE_TAG:-}" == "1" ]] && offline::cluster
  2308. # KUBE_VERSION未指定时,获取集群的版本
  2309. if [[ "${KUBE_VERSION}" == "" || "${KUBE_VERSION}" == "latest" ]]; then
  2310. command::exec "${MGMT_NODE}" "
  2311. kubectl get node --selector='node-role.kubernetes.io/master' -o jsonpath='{range.items[*]}{.status.nodeInfo.kubeletVersion } {end}' | awk -F'v| ' '{print \$2}'
  2312. "
  2313. get::command_output "KUBE_VERSION" "$?" "exit"
  2314. fi
  2315. # 1. 初始化节点
  2316. init::add_node
  2317. # 2. 安装包
  2318. install::package
  2319. # 3. 加入集群
  2320. kubeadm::join
  2321. # 4. haproxy添加apiserver
  2322. config::haproxy_backend "add"
  2323. # 5. 更新 etcd snapshot 副本
  2324. config::etcd_snapshot
  2325. # 6. 查看集群状态
  2326. kube::status
  2327. }
  2328. # 删除节点
  2329. function del::node() {
  2330. config::haproxy_backend "remove"
  2331. local cluster_nodes=""
  2332. local del_hosts_cmd=""
  2333. command::exec "${MGMT_NODE}" "
  2334. kubectl get node -o jsonpath='{range.items[*]}{.status.addresses[?(@.type==\"InternalIP\")].address} {.metadata.name }\\n{end}'
  2335. "
  2336. get::command_output "cluster_nodes" "$?" exit
  2337. for host in $MASTER_NODES
  2338. do
  2339. command::exec "${MGMT_NODE}" "
  2340. etcd_pod=\$(kubectl -n kube-system get pods -l component=etcd --field-selector=status.phase=Running -o jsonpath='{\$.items[0].metadata.name}')
  2341. etcd_node=\$(kubectl -n kube-system exec \$etcd_pod -- sh -c \"export ETCDCTL_API=3 ETCDCTL_CACERT=/etc/kubernetes/pki/etcd/ca.crt ETCDCTL_CERT=/etc/kubernetes/pki/etcd/server.crt ETCDCTL_KEY=/etc/kubernetes/pki/etcd/server.key ETCDCTL_ENDPOINTS=https://127.0.0.1:2379; etcdctl member list\"| grep $host | awk -F, '{print \$1}')
  2342. echo \"\$etcd_pod \$etcd_node\"
  2343. kubectl -n kube-system exec \$etcd_pod -- sh -c \"export ETCDCTL_API=3 ETCDCTL_CACERT=/etc/kubernetes/pki/etcd/ca.crt ETCDCTL_CERT=/etc/kubernetes/pki/etcd/server.crt ETCDCTL_KEY=/etc/kubernetes/pki/etcd/server.key ETCDCTL_ENDPOINTS=https://127.0.0.1:2379; etcdctl member remove \$etcd_node; etcdctl member list\"
  2344. "
  2345. check::exit_code "$?" "del" "remove $host etcd member"
  2346. done
  2347. for host in $MASTER_NODES $WORKER_NODES
  2348. do
  2349. log::info "[del]" "node $host"
  2350. local node_name; node_name=$(echo -ne "${cluster_nodes}" | grep "${host}" | awk '{print $2}')
  2351. if [[ "${node_name}" == "" ]]; then
  2352. log::warning "[del]" "node $host not found."
  2353. read -r -t 10 -n 1 -p "Do you need to reset the node (y/n)? " answer
  2354. [[ -z "$answer" || "$answer" != "y" ]] && exit || echo
  2355. else
  2356. log::info "[del]" "drain $host"
  2357. command::exec "${MGMT_NODE}" "kubectl drain $node_name --force --ignore-daemonsets --delete-local-data"
  2358. check::exit_code "$?" "del" "$host: drain"
  2359. log::info "[del]" "delete node $host"
  2360. command::exec "${MGMT_NODE}" "kubectl delete node $node_name"
  2361. check::exit_code "$?" "del" "$host: delete"
  2362. sleep 3
  2363. fi
  2364. reset::node "$host"
  2365. del_hosts_cmd="${del_hosts_cmd}\nsed -i "/$host/d" /etc/hosts"
  2366. done
  2367. for host in $(echo -ne "${cluster_nodes}" | awk '{print $1}')
  2368. do
  2369. log::info "[del]" "$host: remove del node hostname resolution"
  2370. command::exec "${host}" "
  2371. $(echo -ne "${del_hosts_cmd}")
  2372. "
  2373. check::exit_code "$?" "del" "remove del node hostname resolution"
  2374. done
  2375. [ "$MASTER_NODES" != "" ] && config::etcd_snapshot
  2376. kube::status
  2377. }
  2378. # 升级集群
  2379. function upgrade::cluster() {
  2380. log::info "[upgrade]" "upgrade to $KUBE_VERSION"
  2381. log::info "[upgrade]" "backup cluster"
  2382. add::ops
  2383. local stable_version="2"
  2384. command::exec "127.0.0.1" "wget https://storage.googleapis.com/kubernetes-release/release/stable.txt -q -O -"
  2385. get::command_output "stable_version" "$?" && stable_version="${stable_version#v}"
  2386. local node_hosts="$MASTER_NODES $WORKER_NODES"
  2387. if [[ "$node_hosts" == " " ]]; then
  2388. command::exec "${MGMT_NODE}" "
  2389. kubectl get node -o jsonpath='{range.items[*]}{.metadata.name } {end}'
  2390. "
  2391. get::command_output "node_hosts" "$?" exit
  2392. fi
  2393. local skip_plan=${SKIP_UPGRADE_PLAN,,}
  2394. for host in ${node_hosts}
  2395. do
  2396. log::info "[upgrade]" "node: $host"
  2397. local local_version=""
  2398. command::exec "${host}" "kubectl version --client --short | awk '{print \$3}'"
  2399. get::command_output "local_version" "$?" && local_version="${local_version#v}"
  2400. if [[ "${KUBE_VERSION}" != "latest" ]]; then
  2401. if [[ "${KUBE_VERSION}" == "${local_version}" ]];then
  2402. log::warning "[check]" "The specified version(${KUBE_VERSION}) is consistent with the local version(${local_version})!"
  2403. continue
  2404. fi
  2405. if [[ $(utils::version_to_number "$KUBE_VERSION") -lt $(utils::version_to_number "${local_version}") ]];then
  2406. log::warning "[check]" "The specified version($KUBE_VERSION) is less than the local version(${local_version})!"
  2407. continue
  2408. fi
  2409. if [[ $(utils::version_to_number "$KUBE_VERSION") -gt $(utils::version_to_number "${stable_version}") ]];then
  2410. log::warning "[check]" "The specified version($KUBE_VERSION) is more than the stable version(${stable_version})!"
  2411. continue
  2412. fi
  2413. else
  2414. if [[ $(utils::version_to_number "${local_version}") -ge $(utils::version_to_number "${stable_version}") ]];then
  2415. log::warning "[check]" "The local version($local_version) is greater or equal to the stable version(${stable_version})!"
  2416. continue
  2417. fi
  2418. fi
  2419. command::exec "${MGMT_NODE}" "kubectl drain ${host} --ignore-daemonsets --delete-local-data"
  2420. check::exit_code "$?" "upgrade" "drain ${host} node" "exit"
  2421. sleep 5
  2422. if [[ "${skip_plan}" == "false" ]]; then
  2423. command::exec "${host}" "$(declare -f script::upgrage_kube); script::upgrage_kube 'init' '$KUBE_VERSION'"
  2424. check::exit_code "$?" "upgrade" "plan and upgrade cluster on ${host}" "exit"
  2425. command::exec "${host}" "$(declare -f utils::retry); utils::retry 10 kubectl get node"
  2426. check::exit_code "$?" "upgrade" "${host}: upgrade" "exit"
  2427. skip_plan=true
  2428. else
  2429. command::exec "${host}" "$(declare -f script::upgrage_kube); script::upgrage_kube 'node' '$KUBE_VERSION'"
  2430. check::exit_code "$?" "upgrade" "upgrade ${host} node" "exit"
  2431. fi
  2432. command::exec "${MGMT_NODE}" "kubectl wait --for=condition=Ready node/${host} --timeout=120s"
  2433. check::exit_code "$?" "upgrade" "${host} ready"
  2434. sleep 5
  2435. command::exec "${MGMT_NODE}" "$(declare -f utils::retry); utils::retry 6 kubectl uncordon ${host}"
  2436. check::exit_code "$?" "upgrade" "uncordon ${host} node"
  2437. sleep 5
  2438. done
  2439. kube::status
  2440. }
  2441. # 脚本文件更新
  2442. function update::self() {
  2443. log::info "[update]" "download kainstall script to $0"
  2444. command::exec "127.0.0.1" "
  2445. wget --timeout=10 --waitretry=3 --tries=5 --retry-connrefused https://cdn.jsdelivr.net/gh/lework/kainstall@master/kainstall-centos.sh -O /tmp/kainstall-centos.sh || exit 1
  2446. /bin/mv -fv /tmp/kainstall-centos.sh \"$0\"
  2447. chmod +x \"$0\"
  2448. "
  2449. check::exit_code "$?" "update" "kainstall script"
  2450. }
  2451. # 数据处理及限制
  2452. function transform::data() {
  2453. MASTER_NODES=$(echo "${MASTER_NODES}" | tr ',' ' ')
  2454. WORKER_NODES=$(echo "${WORKER_NODES}" | tr ',' ' ')
  2455. if ! utils::is_element_in_array "$KUBE_CRI" docker containerd cri-o ; then
  2456. log::error "[limit]" "$KUBE_CRI is not supported, only [docker,containerd,cri-o]"
  2457. exit 1
  2458. fi
  2459. [[ "$KUBE_CRI" != "docker" && "${OFFLINE_TAG:-}" == "1" ]] && { log::error "[limit]" "$KUBE_CRI is not supported offline, only docker"; exit 1; }
  2460. [[ "$KUBE_CRI" == "containerd" && "${KUBE_CRI_ENDPOINT}" == "/var/run/dockershim.sock" ]] && KUBE_CRI_ENDPOINT="unix:///run/containerd/containerd.sock"
  2461. [[ "$KUBE_CRI" == "cri-o" && "${KUBE_CRI_ENDPOINT}" == "/var/run/dockershim.sock" ]] && KUBE_CRI_ENDPOINT="unix:///var/run/crio/crio.sock"
  2462. kubelet_nodeRegistration="nodeRegistration:
  2463. criSocket: ${KUBE_CRI_ENDPOINT:-/var/run/dockershim.sock}
  2464. kubeletExtraArgs:
  2465. runtime-cgroups: /system.slice/${KUBE_CRI//-/}.service
  2466. pod-infra-container-image: ${KUBE_IMAGE_REPO}/pause:${PAUSE_VERSION:-3.6}
  2467. "
  2468. }
# Print the usage text to stdout and exit with status 1.
#
# The here-document is unquoted, so the ${...} defaults and $0 are expanded
# when the text is printed and every "\\" is emitted as a single backslash.
function help::usage() {
  cat << EOF
Install kubernetes cluster using kubeadm.
Usage:
$(basename "$0") [command]
Available Commands:
init Init Kubernetes cluster.
reset Reset Kubernetes cluster.
add Add nodes to the cluster.
del Remove node from the cluster.
renew-cert Renew all available certificates.
upgrade Upgrading kubeadm clusters.
update Update script file.
Flag:
-m,--master master node, default: ''
-w,--worker work node, default: ''
-u,--user ssh user, default: ${SSH_USER}
-p,--password ssh password
--private-key ssh private key
-P,--port ssh port, default: ${SSH_PORT}
-v,--version kube version, default: ${KUBE_VERSION}
-n,--network cluster network, choose: [flannel,calico,cilium], default: ${KUBE_NETWORK}
-i,--ingress ingress controller, choose: [nginx,traefik], default: ${KUBE_INGRESS}
-ui,--ui cluster web ui, choose: [dashboard,kubesphere], default: ${KUBE_UI}
-a,--addon cluster add-ons, choose: [metrics-server,nodelocaldns], default: ${KUBE_ADDON}
-M,--monitor cluster monitor, choose: [prometheus]
-l,--log cluster log, choose: [elasticsearch]
-s,--storage cluster storage, choose: [rook,longhorn]
--cri cri tools, choose: [docker,containerd,cri-o], default: ${KUBE_CRI}
--cri-version cri version, default: ${KUBE_CRI_VERSION}
--cri-endpoint cri endpoint, default: ${KUBE_CRI_ENDPOINT}
-U,--upgrade-kernel upgrade kernel
-of,--offline-file specify the offline package file to load
--10years the certificate period is 10 years.
--sudo sudo mode
--sudo-user sudo user
--sudo-password sudo user password
Example:
[init cluster]
$0 init \\
--master 192.168.77.130,192.168.77.131,192.168.77.132 \\
--worker 192.168.77.133,192.168.77.134,192.168.77.135 \\
--user root \\
--password 123456 \\
--version 1.20.4
[reset cluster]
$0 reset \\
--user root \\
--password 123456
[add node]
$0 add \\
--master 192.168.77.140,192.168.77.141 \\
--worker 192.168.77.143,192.168.77.144 \\
--user root \\
--password 123456 \\
--version 1.20.4
[del node]
$0 del \\
--master 192.168.77.140,192.168.77.141 \\
--worker 192.168.77.143,192.168.77.144 \\
--user root \\
--password 123456
[other]
$0 renew-cert --user root --password 123456
$0 upgrade --version 1.20.4 --user root --password 123456
$0 update
$0 add --ingress traefik
$0 add --monitor prometheus
$0 add --log elasticsearch
$0 add --storage rook
$0 add --ui dashboard
$0 add --addon nodelocaldns
EOF
  exit 1
}
  2545. ######################################################################################################
  2546. # main
  2547. ######################################################################################################
  2548. [ "$#" == "0" ] && help::usage
  2549. while [ "${1:-}" != "" ]; do
  2550. case $1 in
  2551. init ) INIT_TAG=1
  2552. ;;
  2553. reset ) RESET_TAG=1
  2554. ;;
  2555. add ) ADD_TAG=1
  2556. ;;
  2557. del ) DEL_TAG=1
  2558. ;;
  2559. renew-cert ) RENEW_CERT_TAG=1
  2560. ;;
  2561. upgrade ) UPGRADE_TAG=1
  2562. ;;
  2563. update ) UPDATE_TAG=1
  2564. ;;
  2565. -m | --master ) shift
  2566. MASTER_NODES=${1:-$MASTER_NODES}
  2567. ;;
  2568. -w | --worker ) shift
  2569. WORKER_NODES=${1:-$WORKER_NODES}
  2570. ;;
  2571. -u | --user ) shift
  2572. SSH_USER=${1:-$SSH_USER}
  2573. ;;
  2574. -p | --password ) shift
  2575. SSH_PASSWORD=${1:-$SSH_PASSWORD}
  2576. ;;
  2577. --private-key ) shift
  2578. SSH_PRIVATE_KEY=${1:-$SSH_SSH_PRIVATE_KEY}
  2579. ;;
  2580. -P | --port ) shift
  2581. SSH_PORT=${1:-$SSH_PORT}
  2582. ;;
  2583. -v | --version ) shift
  2584. KUBE_VERSION=${1:-$KUBE_VERSION}
  2585. ;;
  2586. -n | --network ) shift
  2587. NETWORK_TAG=1
  2588. KUBE_NETWORK=${1:-$KUBE_NETWORK}
  2589. ;;
  2590. -i | --ingress ) shift
  2591. INGRESS_TAG=1
  2592. KUBE_INGRESS=${1:-$KUBE_INGRESS}
  2593. ;;
  2594. -M | --monitor ) shift
  2595. MONITOR_TAG=1
  2596. KUBE_MONITOR=${1:-$KUBE_MONITOR}
  2597. ;;
  2598. -l | --log ) shift
  2599. LOG_TAG=1
  2600. KUBE_LOG=${1:-$KUBE_LOG}
  2601. ;;
  2602. -s | --storage ) shift
  2603. STORAGE_TAG=1
  2604. KUBE_STORAGE=${1:-$KUBE_STORAGE}
  2605. ;;
  2606. -ui | --ui ) shift
  2607. UI_TAG=1
  2608. KUBE_UI=${1:-$KUBE_UI}
  2609. ;;
  2610. -a | --addon ) shift
  2611. ADDON_TAG=1
  2612. KUBE_ADDON=${1:-$KUBE_ADDON}
  2613. ;;
  2614. --cri ) shift
  2615. KUBE_CRI=${1:-$KUBE_CRI}
  2616. ;;
  2617. --cri-version ) shift
  2618. KUBE_CRI_VERSION=${1:-$KUBE_CRI_VERSION}
  2619. ;;
  2620. --cri-endpoint ) shift
  2621. KUBE_CRI_ENDPOINT=${1:-$KUBE_CRI_ENDPOINT}
  2622. ;;
  2623. -U | --upgrade-kernel ) UPGRADE_KERNEL_TAG=1
  2624. ;;
  2625. -of | --offline-file ) shift
  2626. OFFLINE_TAG=1
  2627. OFFLINE_FILE=${1:-$OFFLINE_FILE}
  2628. ;;
  2629. --10years ) CERT_YEAR_TAG=1
  2630. ;;
  2631. --sudo ) SUDO_TAG=1
  2632. ;;
  2633. --sudo-user ) shift
  2634. SUDO_USER=${1:-$SUDO_USER}
  2635. ;;
  2636. --sudo-password ) shift
  2637. SUDO_PASSWORD=${1:-}
  2638. ;;
  2639. * ) help::usage
  2640. exit 1
  2641. esac
  2642. shift
  2643. done
  2644. # 开始
  2645. log::info "[start]" "bash $0 ${SCRIPT_PARAMETER//${SSH_PASSWORD:-${SUDO_PASSWORD:-}}/zzzzzz}"
  2646. # 数据处理
  2647. transform::data
  2648. # 预检
  2649. check::preflight
  2650. # 动作
  2651. if [[ "${INIT_TAG:-}" == "1" ]]; then
  2652. [[ "$MASTER_NODES" == "" ]] && MASTER_NODES="127.0.0.1"
  2653. init::cluster
  2654. elif [[ "${ADD_TAG:-}" == "1" ]]; then
  2655. [[ "${NETWORK_TAG:-}" == "1" ]] && { add::network; add=1; }
  2656. [[ "${INGRESS_TAG:-}" == "1" ]] && { add::ingress; add=1; }
  2657. [[ "${STORAGE_TAG:-}" == "1" ]] && { add::storage; add=1; }
  2658. [[ "${MONITOR_TAG:-}" == "1" ]] && { add::monitor; add=1; }
  2659. [[ "${LOG_TAG:-}" == "1" ]] && { add::log; add=1; }
  2660. [[ "${UI_TAG:-}" == "1" ]] && { add::ui; add=1; }
  2661. [[ "${ADDON_TAG:-}" == "1" ]] && { add::addon; add=1; }
  2662. [[ "$MASTER_NODES" != "" || "$WORKER_NODES" != "" ]] && { add::node; add=1; }
  2663. [[ "${add:-}" != "1" ]] && help::usage
  2664. elif [[ "${DEL_TAG:-}" == "1" ]]; then
  2665. if [[ "$MASTER_NODES" != "" || "$WORKER_NODES" != "" ]]; then del::node; else help::usage; fi
  2666. elif [[ "${RESET_TAG:-}" == "1" ]]; then
  2667. reset::cluster
  2668. elif [[ "${RENEW_CERT_TAG:-}" == "1" ]]; then
  2669. cert::renew
  2670. elif [[ "${UPGRADE_TAG:-}" == "1" ]]; then
  2671. upgrade::cluster
  2672. elif [[ "${UPDATE_TAG:-}" == "1" ]]; then
  2673. update::self
  2674. else
  2675. help::usage
  2676. fi
  2677. # bash <(curl -s http://git.yvanui.com/lizhiwei/jztd-deploy/raw/master/00base/06k8s-install-centos.sh) [cmd]