#!/usr/bin/env bash
#
# 06k8s-install-centos.sh
# Reference: https://raw.githubusercontent.com/lework/kainstall/v1.4.9/kainstall-centos.sh
#
# Settings written as "${NAME:-default}" can be overridden by exporting NAME
# before the script is started.

# DEBUG is read with a default so tracing can be toggled even when the shell
# was started with nounset already active.
[[ -n "${DEBUG:-}" ]] && set -x
set -o errtrace # Make sure any error trap is inherited
set -o nounset # Disallow expansion of unset variables
set -o pipefail # Use last non-zero exit code in a pipeline

# Versions
KUBE_VERSION="${KUBE_VERSION:-latest}"
FLANNEL_VERSION="${FLANNEL_VERSION:-0.17.0}"
CALICO_VERSION="${CALICO_VERSION:-3.22.1}"
CILIUM_VERSION="${CILIUM_VERSION:-1.9.13}"
#METRICS_SERVER_VERSION="${METRICS_SERVER_VERSION:-0.6.1}"
#INGRESS_NGINX="${INGRESS_NGINX:-1.1.2}"
#TRAEFIK_VERSION="${TRAEFIK_VERSION:-2.6.1}"
#KUBE_PROMETHEUS_VERSION="${KUBE_PROMETHEUS_VERSION:-0.10.0}"
#ELASTICSEARCH_VERSION="${ELASTICSEARCH_VERSION:-8.1.0}"
#ROOK_VERSION="${ROOK_VERSION:-1.8.7}"
#LONGHORN_VERSION="${LONGHORN_VERSION:-1.2.4}"
#KUBERNETES_DASHBOARD_VERSION="${KUBERNETES_DASHBOARD_VERSION:-2.5.1}"
#KUBESPHERE_VERSION="${KUBESPHERE_VERSION:-3.2.1}"

# Cluster configuration
KUBE_DNSDOMAIN="${KUBE_DNSDOMAIN:-cluster.local}"
KUBE_APISERVER="${KUBE_APISERVER:-apiserver.$KUBE_DNSDOMAIN}"
KUBE_POD_SUBNET="${KUBE_POD_SUBNET:-10.244.0.0/16}"
KUBE_SERVICE_SUBNET="${KUBE_SERVICE_SUBNET:-10.96.0.0/16}"
KUBE_IMAGE_REPO="${KUBE_IMAGE_REPO:-registry.cn-hangzhou.aliyuncs.com/kainstall}"
KUBE_NETWORK="${KUBE_NETWORK:-flannel}"
KUBE_INGRESS="${KUBE_INGRESS:-nginx}"
KUBE_MONITOR="${KUBE_MONITOR:-prometheus}"
KUBE_STORAGE="${KUBE_STORAGE:-rook}"
KUBE_LOG="${KUBE_LOG:-elasticsearch}"
KUBE_UI="${KUBE_UI:-dashboard}"
KUBE_ADDON="${KUBE_ADDON:-metrics-server}"
KUBE_FLANNEL_TYPE="${KUBE_FLANNEL_TYPE:-vxlan}"
KUBE_CRI="${KUBE_CRI:-docker}"
KUBE_CRI_VERSION="${KUBE_CRI_VERSION:-latest}"
KUBE_CRI_ENDPOINT="${KUBE_CRI_ENDPOINT:-/var/run/dockershim.sock}"

# Master and worker node addresses, comma separated
MASTER_NODES="${MASTER_NODES:-}"
WORKER_NODES="${WORKER_NODES:-}"

# Node on which management commands are executed
MGMT_NODE="${MGMT_NODE:-127.0.0.1}"

# Connection settings for the nodes
SSH_USER="${SSH_USER:-root}"
SSH_PASSWORD="${SSH_PASSWORD:-}"
SSH_PRIVATE_KEY="${SSH_PRIVATE_KEY:-}"
SSH_PORT="${SSH_PORT:-22}"
# NOTE(review): sudo itself exports SUDO_USER (the invoking user); when this
# script is started through sudo that value takes precedence over "root".
# Confirm this is intended.
SUDO_USER="${SUDO_USER:-root}"

# Node settings
HOSTNAME_PREFIX="${HOSTNAME_PREFIX:-k8s}"

# Script settings
# NOTE: the cleanup glob /tmp/kainstall* also matches OFFLINE_DIR below, so
# every start removes it together with the temp dirs of earlier runs.
TMP_DIR="$(rm -rf /tmp/kainstall* && mktemp -d -t kainstall.XXXXXXXXXX)"
LOG_FILE="${TMP_DIR}/kainstall.log"
SSH_OPTIONS="-o ConnectTimeout=600 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null"
ERROR_INFO="\n\033[31mERROR Summary: \033[0m\n "
ACCESS_INFO="\n\033[32mACCESS Summary: \033[0m\n "
COMMAND_OUTPUT=""
SCRIPT_PARAMETER="$*"
OFFLINE_DIR="/tmp/kainstall-offline-file/"
OFFLINE_FILE=""
OS_SUPPORT="centos7 centos8"
GITHUB_PROXY="${GITHUB_PROXY:-https://ghproxy.com/}"
GCR_PROXY="${GCR_PROXY:-k8sgcr.lework.workers.dev}"
SKIP_UPGRADE_PLAN=${SKIP_UPGRADE_PLAN:-false}
SKIP_SET_OS_REPO=${SKIP_SET_OS_REPO:-false}

# Print the summaries on HUP/INT/QUIT/TERM and on normal exit.
trap trap::info 1 2 3 15 EXIT
  ######################################################################################################
  # Common helper functions
  ######################################################################################################
  # Signal / exit handler.
  # Prints the error and access summaries once they have grown past their
  # threshold length (37 and 38 characters respectively), points at the log
  # file when it exists, then disarms the EXIT trap and leaves the script.
  # Globals: ERROR_INFO, ACCESS_INFO, LOG_FILE (read)
  trap::info() {
    if (( ${#ERROR_INFO} > 37 )); then
      echo -e "$ERROR_INFO"
    fi
    if (( ${#ACCESS_INFO} > 38 )); then
      echo -e "$ACCESS_INFO"
    fi
    if [[ -f "$LOG_FILE" ]]; then
      echo -e "\n\n See detailed log >>> $LOG_FILE \n\n"
    fi
    # Clear the EXIT trap first so the exit below does not re-enter this handler.
    trap '' EXIT
    exit
  }
  # Error log.
  # Builds a timestamped, red "ERROR" line from all arguments, appends it to
  # the ERROR_INFO summary and writes it to stdout and the log file.
  # Globals: ERROR_INFO (appended), LOG_FILE (read)
  log::error() {
    local stamp
    stamp="$(date +'%Y-%m-%dT%H:%M:%S.%N%z')"
    local line="[${stamp}]: \033[31mERROR: \033[0m$*"
    ERROR_INFO+="${line}\n "
    echo -e "${line}" | tee -a "$LOG_FILE"
  }
  # Basic log.
  # Writes a timestamped, green "INFO" line built from all arguments to stdout
  # and appends the same line to the log file.
  # Globals: LOG_FILE (read)
  log::info() {
    local now
    now="$(date +'%Y-%m-%dT%H:%M:%S.%N%z')"
    printf "[%s]: \033[32mINFO: \033[0m%s\n" "${now}" "$*" | tee -a "$LOG_FILE"
  }
  # Warning log.
  # Writes a timestamped, yellow "WARNING" line built from all arguments to
  # stdout and appends the same line to the log file.
  # Globals: LOG_FILE (read)
  log::warning() {
    local now
    now="$(date +'%Y-%m-%dT%H:%M:%S.%N%z')"
    printf "[%s]: \033[33mWARNING: \033[0m%s\n" "${now}" "$*" | tee -a "$LOG_FILE"
  }
  # Access information.
  # Remembers the message in the ACCESS_INFO summary and logs it as an INFO
  # line on stdout and in the log file.
  # Globals: ACCESS_INFO (appended), LOG_FILE (read)
  log::access() {
    local now
    ACCESS_INFO+="$*\n "
    now="$(date +'%Y-%m-%dT%H:%M:%S.%N%z')"
    printf "[%s]: \033[32mINFO: \033[0m%s\n" "${now}" "$*" | tee -a "$LOG_FILE"
  }
  # Execution log.
  # Appends a timestamped, blue "EXEC" line built from all arguments to the
  # log file only; nothing is written to stdout.
  # Globals: LOG_FILE (read)
  log::exec() {
    local now
    now="$(date +'%Y-%m-%dT%H:%M:%S.%N%z')"
    printf "[%s]: \033[34mEXEC: \033[0m%s\n" "${now}" "$*" >> "$LOG_FILE"
  }
  # Convert a dotted version string into a single comparable number.
  # The first four dot-separated fields are printed as one integer, fields two
  # to four zero-padded to three digits (missing fields count as 0).
  # Arguments: the version string, e.g. 1.23.4
  # Outputs:   the number on stdout, e.g. 1023004000
  utils::version_to_number() {
    local -r awk_prog='{ printf("%d%03d%03d%03d\n", $1, $2, $3, $4) }'
    echo "$@" | awk -F '.' "${awk_prog}"
  }
  # Retry a command with exponential backoff.
  # Arguments: $1  - maximum number of attempts
  #            $2… - the command; all words are joined and run through eval
  # Outputs:   one progress line on stdout after every failed attempt
  # Returns:   0 as soon as an attempt succeeds, otherwise the exit status of
  #            the last failed attempt
  # The bookkeeping variables are local, so running this helper does not
  # create or overwrite globals in the calling shell.
  function utils::retry {
    local retries=$1
    shift
    local count=0
    local status=0
    local delay=0
    until eval "$*"; do
      status=$?
      # Back off 1, 2, 4, ... seconds between attempts.
      delay=$((2 ** count))
      count=$((count + 1))
      if [ "$count" -lt "$retries" ]; then
        echo "Retry $count/$retries exited $status, retrying in $delay seconds..."
        sleep "$delay"
      else
        echo "Retry $count/$retries exited $status, no more retries left."
        return "$status"
      fi
    done
    return 0
  }
  # Quote a string so it survives one more round of shell parsing.
  # Arguments: the text to quote (all arguments are joined with spaces)
  # Outputs:   ''            when the text is empty or only newlines
  #            the text      when it only contains [A-Za-z0-9:,.=~_/-] and newlines
  #            '…'           otherwise, with embedded single quotes rewritten
  #                          as '"'"'
  # The text is fed through printf instead of echo so option-like input such as
  # "-n" is not swallowed, and the safe set is written without brackets so
  # literal "[" and "]" (glob characters) are quoted rather than passed through.
  function utils::quote() {
    local text="$*"
    local unsafe_bytes

    if [[ -z "${text//$'\n'/}" ]]; then
      echo "''"
      return 0
    fi

    # Count the bytes left once every character from the safe set is removed.
    unsafe_bytes=$(printf '%s' "${text}" | tr -d 'a-zA-Z0-9:,.=~_/\n-' | wc -c)
    if (( unsafe_bytes > 0 )); then
      # Each sed slurps the whole input ("1h;2,$H;$!d;g") so multi-line text is
      # handled as one string: first escape single quotes, then wrap in quotes.
      printf "%s" "${text}" | sed -e "1h;2,\$H;\$!d;g" -e "s/'/\'\"\'\"\'/g" | sed -e "1h;2,\$H;\$!d;g" -e "s/^/'/g" -e "s/$/'/g"
    else
      printf '%s\n' "${text}"
    fi
  }
  # Download a file on the management node.
  # Arguments: $1 - source URL
  #            $2 - destination path
  #            $3 - pass "unzip" to extract the archive next to it (default: 1, no extraction)
  # Globals:   MGMT_NODE (read)
  # Returns:   the exit status of the remote script (after check::exit_code ran)
  # The download is skipped when the destination already exists. Because
  # "wget -O" creates the destination even when the transfer fails, a failed
  # download removes the partial file so the next run fetches it again.
  function utils::download_file() {
    local url="$1"
    local dest="$2"
    local unzip_tag="${3:-1}"
    local dest_dirname; dest_dirname=$(dirname "$dest")
    local filename; filename=$(basename "$dest")
    log::info "[download]" "${filename}"
    command::exec "${MGMT_NODE}" "
      set -e
      if [ ! -f \"${dest}\" ]; then
        [ ! -d \"${dest_dirname}\" ] && mkdir -pv \"${dest_dirname}\"
        wget --timeout=10 --waitretry=3 --tries=5 --retry-connrefused --no-check-certificate \"${url}\" -O \"${dest}\" || { rm -f \"${dest}\"; exit 1; }
        if [[ \"${unzip_tag}\" == \"unzip\" ]]; then
          command -v unzip 2>/dev/null || yum install -y unzip
          unzip -o \"${dest}\" -d \"${dest_dirname}\"
        fi
      else
        echo \"${dest} is exists!\"
      fi
    "
    local status="$?"
    check::exit_code "$status" "download" "${filename}" "exit"
    return "$status"
  }
  # Test whether a value is one of the given elements.
  # Arguments: $1  - the value to look for
  #            $2… - the elements to search
  # Returns:   0 when an element equals the value exactly, 1 otherwise
  utils::is_element_in_array() {
    local -r needle="${1}"
    shift
    local candidate=''
    for candidate in "$@"; do
      if [[ "${candidate}" = "${needle}" ]]; then
        return 0
      fi
    done
    return 1
  }
  # Run a command on a node and capture its output.
  # Arguments: $1  - target host; "127.0.0.1" runs locally, anything else via ssh
  #            $2… - the command text (all remaining arguments joined)
  # Globals:   SUDO_TAG, SUDO_USER, SUDO_PASSWORD, SSH_USER, SSH_PASSWORD,
  #            SSH_PRIVATE_KEY, SSH_PORT, SSH_OPTIONS, LOG_FILE (read)
  #            COMMAND_OUTPUT (written: stdout of the command)
  # Returns:   the status of the "command | tee" pipeline
  # stdout and stderr of the command are appended to LOG_FILE; passwords are
  # replaced by "zzzzzz" in the logged command line. The sudo prefix, which may
  # embed SUDO_PASSWORD, is kept in a local variable so it does not linger in
  # the global scope after the call.
  function command::exec() {
    local host=${1:-}
    shift
    local command="$*"
    local sudo_options=""
    local status=0
    if [[ "${SUDO_TAG:-}" == "1" ]]; then
      sudo_options="sudo -H -n -u ${SUDO_USER}"
      if [[ "${SUDO_PASSWORD:-}" != "" ]]; then
        # With a password: drop the non-interactive flag and feed it on stdin.
        sudo_options="${sudo_options// -n/} -p \"\" -S <<< \"${SUDO_PASSWORD}\""
      fi
      command="$sudo_options bash -c $(utils::quote "$command")"
    fi
    command="$(utils::quote "$command")"
    if [[ "${host}" == "127.0.0.1" ]]; then
      # Local execution
      log::exec "[command]" "bash -c $(printf "%s" "${command//${SUDO_PASSWORD:-}/zzzzzz}")"
      # shellcheck disable=SC2094
      COMMAND_OUTPUT=$(eval bash -c "${command}" 2>> "$LOG_FILE" | tee -a "$LOG_FILE")
      status=$?
    else
      # Remote execution
      local ssh_cmd="ssh"
      if [[ "${SSH_PASSWORD}" != "" ]]; then
        ssh_cmd="sshpass -p \"${SSH_PASSWORD}\" ${ssh_cmd}"
      elif [[ "$SSH_PRIVATE_KEY" != "" ]]; then
        [ -f "${SSH_PRIVATE_KEY}" ] || { log::error "[exec]" "ssh private_key:${SSH_PRIVATE_KEY} not found."; exit 1; }
        ssh_cmd="${ssh_cmd} -i $SSH_PRIVATE_KEY"
      fi
      log::exec "[command]" "${ssh_cmd//${SSH_PASSWORD:-}/zzzzzz} ${SSH_OPTIONS} ${SSH_USER}@${host} -p ${SSH_PORT} bash -c $(printf "%s" "${command//${SUDO_PASSWORD:-}/zzzzzz}")"
      # shellcheck disable=SC2094
      COMMAND_OUTPUT=$(eval "${ssh_cmd} ${SSH_OPTIONS} ${SSH_USER}@${host} -p ${SSH_PORT}" bash -c '"${command}"' 2>> "$LOG_FILE" | tee -a "$LOG_FILE")
      status=$?
    fi
    return $status
  }
  # Copy a file or directory to a node.
  # Arguments: $1 - target host; "127.0.0.1" copies locally with cp -rf
  #            $2 - source path
  #            $3 - destination path (default: /tmp/)
  # Globals:   SSH_USER, SSH_PASSWORD, SSH_PRIVATE_KEY, SSH_PORT, SSH_OPTIONS,
  #            LOG_FILE (read); COMMAND_OUTPUT (written)
  # Returns:   the status of the "copy | tee" pipeline
  # The logged command line shows "zzzzzz" in place of SSH_PASSWORD, matching
  # what command::exec writes to the log.
  function command::scp() {
    local host=${1:-}
    local src=${2:-}
    local dest=${3:-/tmp/}
    local status=0
    if [[ "${host}" == "127.0.0.1" ]]; then
      local command="cp -rf ${src} ${dest}"
      log::exec "[command]" "bash -c \"${command}\""
      # shellcheck disable=SC2094
      COMMAND_OUTPUT=$(bash -c "${command}" 2>> "$LOG_FILE" | tee -a "$LOG_FILE")
      status=$?
    else
      local scp_cmd="scp"
      if [[ "${SSH_PASSWORD}" != "" ]]; then
        scp_cmd="sshpass -p \"${SSH_PASSWORD}\" ${scp_cmd}"
      elif [[ "$SSH_PRIVATE_KEY" != "" ]]; then
        [ -f "${SSH_PRIVATE_KEY}" ] || { log::error "[exec]" "ssh private_key:${SSH_PRIVATE_KEY} not found."; exit 1; }
        scp_cmd="${scp_cmd} -i $SSH_PRIVATE_KEY"
      fi
      # Never write the clear-text password to the log file.
      local shown_cmd="${scp_cmd}"
      if [[ -n "${SSH_PASSWORD}" ]]; then
        shown_cmd=${scp_cmd//"${SSH_PASSWORD}"/zzzzzz}
      fi
      log::exec "[command]" "${shown_cmd} ${SSH_OPTIONS} -P ${SSH_PORT} -r ${src} ${SSH_USER}@${host}:${dest}"
      # shellcheck disable=SC2094
      COMMAND_OUTPUT=$(eval "${scp_cmd} ${SSH_OPTIONS} -P ${SSH_PORT} -r ${src} ${SSH_USER}@${host}:${dest}" 2>> "$LOG_FILE" | tee -a "$LOG_FILE")
      status=$?
    fi
    return $status
  }
  # Make sure a command is available, installing its package when it is not.
  # Arguments: $1 - command name to look up
  #            $2 - yum package that provides it
  # When the command is missing the package is installed on 127.0.0.1 through
  # command::exec and the result is handed to check::exit_code.
  check::command_exists() {
    local cmd=${1}
    local package=${2}

    if command -V "$cmd" > /dev/null 2>&1; then
      log::info "[check]" "$cmd command exists."
      return
    fi

    log::warning "[check]" "I require $cmd but it's not installed."
    log::warning "[check]" "install $package package."
    command::exec "127.0.0.1" "yum install -y ${package}"
    check::exit_code "$?" "check" "$package install" "exit"
  }
  245. ######################################################################################################
  246. # 安装函数
  247. ######################################################################################################
  # Node initialisation script.
  # Prepares a CentOS host in one pass: removes the blocks a previous run
  # wrote, disables SELinux, swap and firewalld, switches the yum repos to the
  # Aliyun mirrors, raises limits, writes sysctl / bashrc / journald / login
  # banner / chrony / audit configuration, loads the kernel modules listed
  # below and finally flushes ipvs and iptables rules.
  # Globals: KUBE_APISERVER, SKIP_SET_OS_REPO, OFFLINE_TAG (read)
  # Changes against the earlier revision:
  #   - the login banner's IP lookup keeps its awk field reference (\$2)
  #     instead of having it expanded while the banner file is generated;
  #   - the load-average line ends its colour with a complete reset sequence;
  #   - module names are only appended to ipvs.conf when not listed yet, so
  #     repeated runs do not pile up duplicates;
  #   - loop variables are local.
  function script::init_node() {
    local target
    local kernel_module
    local -a module

    # Clean up what an earlier run left behind
    sed -i -e "/$KUBE_APISERVER/d" -e '/-worker-/d' -e '/-master-/d' /etc/hosts
    sed -i '/## Kainstall managed start/,/## Kainstall managed end/d' /etc/security/limits.conf /etc/systemd/system.conf /etc/bashrc /etc/rc.local /etc/audit/rules.d/audit.rules

    # Disable selinux
    sed -i '/SELINUX/s/enforcing/disabled/' /etc/selinux/config
    setenforce 0

    # Disable swap
    swapoff -a && sysctl -w vm.swappiness=0
    sed -ri '/^[^#]*swap/s@^@#@' /etc/fstab

    # Disable firewalld
    for target in firewalld python-firewall firewalld-filesystem iptables; do
      systemctl stop $target &>/dev/null || true
      systemctl disable $target &>/dev/null || true
    done

    # Yum repositories (skipped unless SKIP_SET_OS_REPO is "false")
    [[ -f /etc/yum.repos.d/CentOS-Base.repo && "${SKIP_SET_OS_REPO,,}" == "false" ]] && sed -e 's!^#baseurl=!baseurl=!g' \
      -e 's!^mirrorlist=!#mirrorlist=!g' \
      -e 's!mirror.centos.org!mirrors.aliyun.com!g' \
      -i /etc/yum.repos.d/CentOS-Base.repo
    [[ "${OFFLINE_TAG:-}" != "1" && "${SKIP_SET_OS_REPO,,}" == "false" ]] && yum install -y epel-release
    [[ -f /etc/yum.repos.d/epel.repo && "${SKIP_SET_OS_REPO,,}" == "false" ]] && sed -e 's!^mirrorlist=!#mirrorlist=!g' \
      -e 's!^metalink=!#metalink=!g' \
      -e 's!^#baseurl=!baseurl=!g' \
      -e 's!//download.*/pub!//mirrors.aliyun.com!g' \
      -e 's!http://mirrors\.aliyun!https://mirrors.aliyun!g' \
      -i /etc/yum.repos.d/epel.repo

    # Change limits (the original file is backed up once)
    [ ! -f /etc/security/limits.conf_bak ] && cp /etc/security/limits.conf{,_bak}
    cat << EOF >> /etc/security/limits.conf
## Kainstall managed start
root soft nofile 655360
root hard nofile 655360
root soft nproc 655360
root hard nproc 655360
root soft core unlimited
root hard core unlimited
* soft nofile 655360
* hard nofile 655360
* soft nproc 655360
* hard nproc 655360
* soft core unlimited
* hard core unlimited
## Kainstall managed end
EOF

    # Raise the 4096 value in the nproc drop-in as well
    [ -f /etc/security/limits.d/20-nproc.conf ] && sed -i 's#4096#655360#g' /etc/security/limits.d/20-nproc.conf

    # /etc/systemd/system.conf
    cat << EOF >> /etc/systemd/system.conf
## Kainstall managed start
DefaultLimitCORE=infinity
DefaultLimitNOFILE=655360
DefaultLimitNPROC=655360
DefaultTasksMax=75%
## Kainstall managed end
EOF

    # Change sysctl
    cat << EOF > /etc/sysctl.d/99-kube.conf
# https://www.kernel.org/doc/Documentation/sysctl/
#############################################################################################
# 调整虚拟内存
#############################################################################################
# Default: 30
# 0 - 任何情况下都不使用swap。
# 1 - 除非内存不足(OOM),否则不使用swap。
vm.swappiness = 0
# 内存分配策略
#0 - 表示内核将检查是否有足够的可用内存供应用进程使用;如果有足够的可用内存,内存申请允许;否则,内存申请失败,并把错误返回给应用进程。
#1 - 表示内核允许分配所有的物理内存,而不管当前的内存状态如何。
#2 - 表示内核允许分配超过所有物理内存和交换空间总和的内存
vm.overcommit_memory=1
# OOM时处理
# 1关闭,等于0时,表示当内存耗尽时,内核会触发OOM killer杀掉最耗内存的进程。
vm.panic_on_oom=0
# vm.dirty_background_ratio 用于调整内核如何处理必须刷新到磁盘的脏页。
# Default value is 10.
# 该值是系统内存总量的百分比,在许多情况下将此值设置为5是合适的。
# 此设置不应设置为零。
vm.dirty_background_ratio = 5
# 内核强制同步操作将其刷新到磁盘之前允许的脏页总数
# 也可以通过更改 vm.dirty_ratio 的值(将其增加到默认值30以上(也占系统内存的百分比))来增加
# 推荐 vm.dirty_ratio 的值在60到80之间。
vm.dirty_ratio = 60
# vm.max_map_count 计算当前的内存映射文件数。
# mmap 限制(vm.max_map_count)的最小值是打开文件的ulimit数量(cat /proc/sys/fs/file-max)。
# 每128KB系统内存 map_count应该大约为1。 因此,在32GB系统上,max_map_count为262144。
# Default: 65530
vm.max_map_count = 2097152
#############################################################################################
# 调整文件
#############################################################################################
fs.may_detach_mounts = 1
# 增加文件句柄和inode缓存的大小,并限制核心转储。
fs.file-max = 2097152
fs.nr_open = 2097152
fs.suid_dumpable = 0
# 文件监控
fs.inotify.max_user_instances=8192
fs.inotify.max_user_watches=524288
fs.inotify.max_queued_events=16384
#############################################################################################
# 调整网络设置
#############################################################################################
# 为每个套接字的发送和接收缓冲区分配的默认内存量。
net.core.wmem_default = 25165824
net.core.rmem_default = 25165824
# 为每个套接字的发送和接收缓冲区分配的最大内存量。
net.core.wmem_max = 25165824
net.core.rmem_max = 25165824
# 除了套接字设置外,发送和接收缓冲区的大小
# 必须使用net.ipv4.tcp_wmem和net.ipv4.tcp_rmem参数分别设置TCP套接字。
# 使用三个以空格分隔的整数设置这些整数,分别指定最小,默认和最大大小。
# 最大大小不能大于使用net.core.wmem_max和net.core.rmem_max为所有套接字指定的值。
# 合理的设置是最小4KiB,默认64KiB和最大2MiB缓冲区。
net.ipv4.tcp_wmem = 20480 12582912 25165824
net.ipv4.tcp_rmem = 20480 12582912 25165824
# 增加最大可分配的总缓冲区空间
# 以页为单位(4096字节)进行度量
net.ipv4.tcp_mem = 65536 25165824 262144
net.ipv4.udp_mem = 65536 25165824 262144
# 为每个套接字的发送和接收缓冲区分配的最小内存量。
net.ipv4.udp_wmem_min = 16384
net.ipv4.udp_rmem_min = 16384
# 启用TCP窗口缩放,客户端可以更有效地传输数据,并允许在代理方缓冲该数据。
net.ipv4.tcp_window_scaling = 1
# 提高同时接受连接数。
net.ipv4.tcp_max_syn_backlog = 10240
# 将net.core.netdev_max_backlog的值增加到大于默认值1000
# 可以帮助突发网络流量,特别是在使用数千兆位网络连接速度时,
# 通过允许更多的数据包排队等待内核处理它们。
net.core.netdev_max_backlog = 65536
# 增加选项内存缓冲区的最大数量
net.core.optmem_max = 25165824
# 被动TCP连接的SYNACK次数。
net.ipv4.tcp_synack_retries = 2
# 允许的本地端口范围。
net.ipv4.ip_local_port_range = 2048 65535
# 防止TCP时间等待
# Default: net.ipv4.tcp_rfc1337 = 0
net.ipv4.tcp_rfc1337 = 1
# 减少tcp_fin_timeout连接的时间默认值
net.ipv4.tcp_fin_timeout = 15
# 积压套接字的最大数量。
# Default is 128.
net.core.somaxconn = 32768
# 打开syncookies以进行SYN洪水攻击保护。
net.ipv4.tcp_syncookies = 1
# 避免Smurf攻击
# 发送伪装的ICMP数据包,目的地址设为某个网络的广播地址,源地址设为要攻击的目的主机,
# 使所有收到此ICMP数据包的主机都将对目的主机发出一个回应,使被攻击主机在某一段时间内收到成千上万的数据包
net.ipv4.icmp_echo_ignore_broadcasts = 1
# 为icmp错误消息打开保护
net.ipv4.icmp_ignore_bogus_error_responses = 1
# 启用自动缩放窗口。
# 如果延迟证明合理,这将允许TCP缓冲区超过其通常的最大值64K。
net.ipv4.tcp_window_scaling = 1
# 打开并记录欺骗,源路由和重定向数据包
net.ipv4.conf.all.log_martians = 1
net.ipv4.conf.default.log_martians = 1
# 告诉内核有多少个未附加的TCP套接字维护用户文件句柄。 万一超过这个数字,
# 孤立的连接会立即重置,并显示警告。
# Default: net.ipv4.tcp_max_orphans = 65536
net.ipv4.tcp_max_orphans = 65536
# 不要在关闭连接时缓存指标
net.ipv4.tcp_no_metrics_save = 1
# 启用RFC1323中定义的时间戳记:
# Default: net.ipv4.tcp_timestamps = 1
net.ipv4.tcp_timestamps = 1
# 启用选择确认。
# Default: net.ipv4.tcp_sack = 1
net.ipv4.tcp_sack = 1
# 增加 tcp-time-wait 存储桶池大小,以防止简单的DOS攻击。
# net.ipv4.tcp_tw_recycle 已从Linux 4.12中删除。请改用net.ipv4.tcp_tw_reuse。
net.ipv4.tcp_max_tw_buckets = 14400
net.ipv4.tcp_tw_reuse = 1
# accept_source_route 选项使网络接口接受设置了严格源路由(SSR)或松散源路由(LSR)选项的数据包。
# 以下设置将丢弃设置了SSR或LSR选项的数据包。
net.ipv4.conf.all.accept_source_route = 0
net.ipv4.conf.default.accept_source_route = 0
# 打开反向路径过滤
net.ipv4.conf.all.rp_filter = 1
net.ipv4.conf.default.rp_filter = 1
# 禁用ICMP重定向接受
net.ipv4.conf.all.accept_redirects = 0
net.ipv4.conf.default.accept_redirects = 0
net.ipv4.conf.all.secure_redirects = 0
net.ipv4.conf.default.secure_redirects = 0
# 禁止发送所有IPv4 ICMP重定向数据包。
net.ipv4.conf.all.send_redirects = 0
net.ipv4.conf.default.send_redirects = 0
# 开启IP转发.
net.ipv4.ip_forward = 1
# 禁止IPv6
net.ipv6.conf.lo.disable_ipv6=1
net.ipv6.conf.all.disable_ipv6 = 1
net.ipv6.conf.default.disable_ipv6 = 1
# 要求iptables不对bridge的数据进行处理
net.bridge.bridge-nf-call-ip6tables = 1
net.bridge.bridge-nf-call-iptables = 1
net.bridge.bridge-nf-call-arptables = 1
# arp缓存
# 存在于 ARP 高速缓存中的最少层数,如果少于这个数,垃圾收集器将不会运行。缺省值是 128
net.ipv4.neigh.default.gc_thresh1=2048
# 保存在 ARP 高速缓存中的最多的记录软限制。垃圾收集器在开始收集前,允许记录数超过这个数字 5 秒。缺省值是 512
net.ipv4.neigh.default.gc_thresh2=4096
# 保存在 ARP 高速缓存中的最多记录的硬限制,一旦高速缓存中的数目高于此,垃圾收集器将马上运行。缺省值是 1024
net.ipv4.neigh.default.gc_thresh3=8192
# 持久连接
net.ipv4.tcp_keepalive_time = 600
net.ipv4.tcp_keepalive_intvl = 30
net.ipv4.tcp_keepalive_probes = 10
# conntrack表
net.nf_conntrack_max=1048576
net.netfilter.nf_conntrack_max=1048576
net.netfilter.nf_conntrack_buckets=262144
net.netfilter.nf_conntrack_tcp_timeout_fin_wait=30
net.netfilter.nf_conntrack_tcp_timeout_time_wait=30
net.netfilter.nf_conntrack_tcp_timeout_close_wait=15
net.netfilter.nf_conntrack_tcp_timeout_established=300
#############################################################################################
# 调整内核参数
#############################################################################################
# 地址空间布局随机化(ASLR)是一种用于操作系统的内存保护过程,可防止缓冲区溢出攻击。
# 这有助于确保与系统上正在运行的进程相关联的内存地址不可预测,
# 因此,与这些流程相关的缺陷或漏洞将更加难以利用。
# Accepted values: 0 = 关闭, 1 = 保守随机化, 2 = 完全随机化
kernel.randomize_va_space = 2
# 调高 PID 数量
kernel.pid_max = 65536
kernel.threads-max=30938
# coredump
kernel.core_pattern=core
# 决定了检测到soft lockup时是否自动panic,缺省值是0
kernel.softlockup_all_cpu_backtrace=1
kernel.softlockup_panic=1
EOF

    # Shell history settings and prompt
    cat << EOF >> /etc/bashrc
## Kainstall managed start
# history actions record,include action time, user, login ip
HISTFILESIZE=5000
HISTSIZE=5000
USER_IP=\$(who -u am i 2>/dev/null | awk '{print \$NF}' | sed -e 's/[()]//g')
if [ -z \$USER_IP ]
then
USER_IP=\$(hostname -i)
fi
HISTTIMEFORMAT="%Y-%m-%d %H:%M:%S \$USER_IP:\$(whoami) "
export HISTFILESIZE HISTSIZE HISTTIMEFORMAT
# PS1
PS1='\[\033[0m\]\[\033[1;36m\][\u\[\033[0m\]@\[\033[1;32m\]\h\[\033[0m\] \[\033[1;31m\]\w\[\033[0m\]\[\033[1;36m\]]\[\033[33;1m\]\\$ \[\033[0m\]'
## Kainstall managed end
EOF

    # journald
    mkdir -p /var/log/journal /etc/systemd/journald.conf.d
    cat << EOF > /etc/systemd/journald.conf.d/99-prophet.conf
[Journal]
# 持久化保存到磁盘
Storage=persistent
# 压缩历史日志
Compress=yes
SyncIntervalSec=5m
RateLimitInterval=30s
RateLimitBurst=1000
# 最大占用空间 2G
SystemMaxUse=2G
# 单日志文件最大 100M
SystemMaxFileSize=100M
# 日志保存时间 3 周
MaxRetentionSec=3week
# 不将日志转发到 syslog
ForwardToSyslog=no
EOF

    # Login banner (motd). Every \$ below is escaped so it reaches the
    # generated file literally and is evaluated at login time.
    cat << EOF > /etc/profile.d/zz-ssh-login-info.sh
#!/bin/sh
#
# @Time : 2020-02-04
# @Author : lework
# @Desc : ssh login banner
export PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:\$PATH
#shopt -q login_shell && : || return 0
# os
upSeconds="\$(cut -d. -f1 /proc/uptime)"
secs=\$((\${upSeconds}%60))
mins=\$((\${upSeconds}/60%60))
hours=\$((\${upSeconds}/3600%24))
days=\$((\${upSeconds}/86400))
UPTIME_INFO=\$(printf "%d days, %02dh %02dm %02ds" "\$days" "\$hours" "\$mins" "\$secs")
if [ -f /etc/redhat-release ] ; then
PRETTY_NAME=\$(< /etc/redhat-release)
elif [ -f /etc/debian_version ]; then
DIST_VER=\$(</etc/debian_version)
PRETTY_NAME="\$(grep PRETTY_NAME /etc/os-release | sed -e 's/PRETTY_NAME=//g' -e 's/"//g') (\$DIST_VER)"
else
PRETTY_NAME=\$(cat /etc/*-release | grep "PRETTY_NAME" | sed -e 's/PRETTY_NAME=//g' -e 's/"//g')
fi
if [[ -d "/system/app/" && -d "/system/priv-app" ]]; then
model="\$(getprop ro.product.brand) \$(getprop ro.product.model)"
elif [[ -f /sys/devices/virtual/dmi/id/product_name ||
-f /sys/devices/virtual/dmi/id/product_version ]]; then
model="\$(< /sys/devices/virtual/dmi/id/product_name)"
model+=" \$(< /sys/devices/virtual/dmi/id/product_version)"
elif [[ -f /sys/firmware/devicetree/base/model ]]; then
model="\$(< /sys/firmware/devicetree/base/model)"
elif [[ -f /tmp/sysinfo/model ]]; then
model="\$(< /tmp/sysinfo/model)"
fi
MODEL_INFO=\${model}
KERNEL=\$(uname -srmo)
USER_NUM=\$(who -u | wc -l)
RUNNING=\$(ps ax | wc -l | tr -d " ")
# disk
totaldisk=\$(df -h -x devtmpfs -x tmpfs -x debugfs -x aufs -x overlay --total 2>/dev/null | tail -1)
disktotal=\$(awk '{print \$2}' <<< "\${totaldisk}")
diskused=\$(awk '{print \$3}' <<< "\${totaldisk}")
diskusedper=\$(awk '{print \$5}' <<< "\${totaldisk}")
DISK_INFO="\033[0;33m\${diskused}\033[0m of \033[1;34m\${disktotal}\033[0m disk space used (\033[0;33m\${diskusedper}\033[0m)"
# cpu
cpu=\$(awk -F':' '/^model name/ {print \$2}' /proc/cpuinfo | uniq | sed -e 's/^[ \t]*//')
cpun=\$(grep -c '^processor' /proc/cpuinfo)
cpuc=\$(grep '^cpu cores' /proc/cpuinfo | tail -1 | awk '{print \$4}')
cpup=\$(grep '^physical id' /proc/cpuinfo | wc -l)
CPU_INFO="\${cpu} \${cpup}P \${cpuc}C \${cpun}L"
# get the load averages
read one five fifteen rest < /proc/loadavg
LOADAVG_INFO="\033[0;33m\${one}\033[0m / \${five} / \${fifteen} with \033[1;34m\$(( cpun*cpuc ))\033[0m core(s) at \033[1;34m\$(grep '^cpu MHz' /proc/cpuinfo | tail -1 | awk '{print \$4}')\033[0m MHz"
# mem
MEM_INFO="\$(cat /proc/meminfo | awk '/MemTotal:/{total=\$2/1024/1024;next} /MemAvailable:/{use=total-\$2/1024/1024; printf("\033[0;33m%.2fGiB\033[0m of \033[1;34m%.2fGiB\033[0m RAM used (\033[0;33m%.2f%%\033[0m)",use,total,(use/total)*100);}')"
# network
# extranet_ip=" and \$(curl -s ip.cip.cc)"
IP_INFO="\$(ip a|grep -E '^[0-9]+: em*|^[0-9]+: eno*|^[0-9]+: enp*|^[0-9]+: ens*|^[0-9]+: eth*|^[0-9]+: wlp*' -A2|grep inet|awk -F ' ' '{print \$2}'|cut -f1 -d/|xargs echo)"
# Container info
CONTAINER_INFO="\$(sudo /usr/bin/crictl ps -a -o yaml 2> /dev/null | awk '/^ state: /{gsub("CONTAINER_", "", \$NF) ++S[\$NF]}END{for(m in S) printf "%s%s:%s ",substr(m,1,1),tolower(substr(m,2)),S[m]}')Images:\$(sudo /usr/bin/crictl images -q 2> /dev/null | wc -l)"
# info
echo -e "
Information as of: \033[1;34m\$(date +"%Y-%m-%d %T")\033[0m
\033[0;1;31mProduct\033[0m............: \${MODEL_INFO}
\033[0;1;31mOS\033[0m.................: \${PRETTY_NAME}
\033[0;1;31mKernel\033[0m.............: \${KERNEL}
\033[0;1;31mCPU\033[0m................: \${CPU_INFO}
\033[0;1;31mHostname\033[0m...........: \033[1;34m\$(hostname)\033[0m
\033[0;1;31mIP Addresses\033[0m.......: \033[1;34m\${IP_INFO}\033[0m
\033[0;1;31mUptime\033[0m.............: \033[0;33m\${UPTIME_INFO}\033[0m
\033[0;1;31mMemory\033[0m.............: \${MEM_INFO}
\033[0;1;31mLoad Averages\033[0m......: \${LOADAVG_INFO}
\033[0;1;31mDisk Usage\033[0m.........: \${DISK_INFO}
\033[0;1;31mUsers online\033[0m.......: \033[1;34m\${USER_NUM}\033[0m
\033[0;1;31mRunning Processes\033[0m..: \033[1;34m\${RUNNING}\033[0m
\033[0;1;31mContainer Info\033[0m.....: \${CONTAINER_INFO}
"
EOF
    chmod +x /etc/profile.d/zz-ssh-login-info.sh
    # NOTE(review): this sudoers rule lets every user run crictl as any user
    # without a password (the banner above calls it via sudo) - confirm that
    # this is acceptable on the target hosts.
    echo 'ALL ALL=(ALL) NOPASSWD:/usr/bin/crictl' > /etc/sudoers.d/crictl

    # Time sync: replace ntp with chrony
    ntpd --help >/dev/null 2>&1 && yum remove -y ntp
    [[ "${OFFLINE_TAG:-}" != "1" ]] && yum install -y chrony
    [ ! -f /etc/chrony.conf_bak ] && cp /etc/chrony.conf{,_bak} # back up the default configuration once
    cat << EOF > /etc/chrony.conf
server ntp.aliyun.com iburst
server cn.ntp.org.cn iburst
server ntp.shu.edu.cn iburst
server 0.cn.pool.ntp.org iburst
server 1.cn.pool.ntp.org iburst
server 2.cn.pool.ntp.org iburst
server 3.cn.pool.ntp.org iburst
driftfile /var/lib/chrony/drift
makestep 1.0 3
logdir /var/log/chrony
EOF
    timedatectl set-timezone Asia/Shanghai
    chronyd -q -t 1 'server cn.pool.ntp.org iburst maxsamples 1'
    systemctl enable chronyd
    systemctl start chronyd
    chronyc sources -v
    chronyc sourcestats
    hwclock --systohc

    # Packages
    [[ "${OFFLINE_TAG:-}" != "1" ]] && yum install -y curl wget

    # ipvs tooling and kernel modules
    [[ "${OFFLINE_TAG:-}" != "1" ]] && yum install -y ipvsadm ipset sysstat conntrack libseccomp
    module=(
      ip_vs
      ip_vs_rr
      ip_vs_wrr
      ip_vs_sh
      overlay
      nf_conntrack
      br_netfilter
    )
    [ -f /etc/modules-load.d/ipvs.conf ] && cp -f /etc/modules-load.d/ipvs.conf{,_bak}
    for kernel_module in "${module[@]}"; do
      # Only list modules that modinfo can resolve on this kernel, and only
      # once, so re-running the function does not duplicate entries.
      if /sbin/modinfo -F filename "$kernel_module" |& grep -qv ERROR; then
        grep -qxF "$kernel_module" /etc/modules-load.d/ipvs.conf 2>/dev/null \
          || echo "$kernel_module" >> /etc/modules-load.d/ipvs.conf
      fi
    done
    systemctl restart systemd-modules-load
    systemctl enable systemd-modules-load
    sysctl --system

    # audit
    [[ "${OFFLINE_TAG:-}" != "1" ]] && yum install -y audit audit-libs
    # /etc/audit/rules.d/audit.rules
    cat << EOF >> /etc/audit/rules.d/audit.rules
## Kainstall managed start
# Ignore errors
-i
# SYSCALL
-a always,exit -F arch=b64 -S kill,tkill,tgkill -F a1=9 -F key=trace_kill_9
-a always,exit -F arch=b64 -S kill,tkill,tgkill -F a1=15 -F key=trace_kill_15
# docker
-w /usr/bin/dockerd -k docker
-w /var/lib/docker -k docker
-w /etc/docker -k docker
-w /usr/lib/systemd/system/docker.service -k docker
-w /etc/systemd/system/docker.service -k docker
-w /usr/lib/systemd/system/docker.socket -k docker
-w /etc/default/docker -k docker
-w /etc/sysconfig/docker -k docker
-w /etc/docker/daemon.json -k docker
# containerd
-w /usr/bin/containerd -k containerd
-w /var/lib/containerd -k containerd
-w /usr/lib/systemd/system/containerd.service -k containerd
-w /etc/containerd/config.toml -k containerd
# cri-o
-w /usr/bin/crio -k cri-o
-w /etc/crio -k cri-o
# runc
-w /usr/bin/runc -k runc
# kube
-w /usr/bin/kubeadm -k kubeadm
-w /usr/bin/kubelet -k kubelet
-w /usr/bin/kubectl -k kubectl
-w /var/lib/kubelet -k kubelet
-w /etc/kubernetes -k kubernetes
## Kainstall managed end
EOF
    chmod 600 /etc/audit/rules.d/audit.rules
    sed -i 's#max_log_file =.*#max_log_file = 80#g' /etc/audit/auditd.conf
    if [ -f /usr/libexec/initscripts/legacy-actions/auditd/restart ]; then
      /usr/libexec/initscripts/legacy-actions/auditd/restart
    else
      systemctl stop auditd && systemctl start auditd
    fi
    systemctl enable auditd

    # Add resolver options once, then flush ipvs and iptables rules
    grep single-request-reopen /etc/resolv.conf || sed -i '1ioptions timeout:2 attempts:3 rotate single-request-reopen' /etc/resolv.conf
    ipvsadm --clear
    iptables -F && iptables -t nat -F && iptables -t mangle -F && iptables -X
  }
  # Kernel upgrade.
  # Installs the ELRepo release package for the running CentOS major version,
  # points elrepo.repo at the TUNA mirror, installs kernel-lt (+devel), makes
  # the first grub entry the default and adds user_namespace.enable=1 to the
  # default kernel's arguments.
  # Globals: OFFLINE_TAG (read; "1" skips both yum installs)
  script::upgrade_kernel() {
    local ver
    ver=$(rpm --eval "%{centos_ver}")

    if [[ "${OFFLINE_TAG:-}" != "1" ]]; then
      yum install -y "https://www.elrepo.org/elrepo-release-${ver}.el${ver}.elrepo.noarch.rpm"
    fi

    sed -e "s/^mirrorlist=/#mirrorlist=/g" \
      -e "s/elrepo.org\/linux/mirrors.tuna.tsinghua.edu.cn\/elrepo/g" \
      -i /etc/yum.repos.d/elrepo.repo

    if [[ "${OFFLINE_TAG:-}" != "1" ]]; then
      yum install -y --disablerepo="*" --enablerepo=elrepo-kernel kernel-lt kernel-lt-devel
    fi

    if grub2-set-default 0; then
      grub2-mkconfig -o /etc/grub2.cfg
    fi

    # Show the default kernel, then update its boot arguments.
    grubby --default-kernel
    grubby --args="user_namespace.enable=1" --update-kernel="$(grubby --default-kernel)"
  }
  # Upgrade the Kubernetes packages on a node.
  # Arguments: $1 - role: "init" runs "kubeadm upgrade plan" + "apply",
  #                 anything else runs "kubeadm upgrade node" (default: init)
  #            $2 - package version; "latest" installs without a version
  #                 suffix (default: latest)
  # Returns:   non-zero when a step fails; "set -e" is switched on inside the
  #            function, so the first failing command ends the running shell
  # The target version is taken from the last "kubeadm upgrade apply <ver>"
  # line of the plan output; when the plan offers none, the function stops
  # with a message instead of calling "kubeadm upgrade apply" with an empty
  # version. The accounting lines are only added to the kubelet drop-in when
  # they are not there yet, so repeated upgrades do not duplicate them.
  function script::upgrage_kube() {
    local role=${1:-init}
    local version="-${2:-latest}"
    version="${version#-latest}"
    local dropin="/usr/lib/systemd/system/kubelet.service.d/10-kubeadm.conf"
    set -e
    echo '[install] kubeadm'
    kubeadm version
    yum install -y "kubeadm${version}" --disableexcludes=kubernetes
    kubeadm version
    echo '[upgrade]'
    if [[ "$role" == "init" ]]; then
      local plan_info; plan_info=$(kubeadm upgrade plan)
      local v; v=$(printf "%s" "$plan_info" | awk '/kubeadm upgrade apply /{ver=$4} END{print ver}')
      printf "%s\n" "${plan_info}"
      if [[ -z "${v}" ]]; then
        echo "[upgrade] no target version found in the upgrade plan output" >&2
        return 1
      fi
      kubeadm upgrade apply "${v}" -y
    else
      kubeadm upgrade node
    fi
    echo '[install] kubelet kubectl'
    kubectl version --client=true
    yum install -y "kubelet${version}" "kubectl${version}" --disableexcludes=kubernetes
    kubectl version --client=true
    if [ -f "${dropin}" ] && ! grep -q '^CPUAccounting=true' "${dropin}"; then
      sed -i 's#^\[Service\]#[Service]\nCPUAccounting=true\nMemoryAccounting=true#g' "${dropin}"
    fi
    systemctl daemon-reload
    systemctl restart kubelet
  }
  735. # 安装 docker
  736. function script::install_docker() {
  737. local version="-${1:-latest}"
  738. version="${version#-latest}"
  739. cat << EOF > /etc/yum.repos.d/docker-ce.repo
  740. [docker-ce-stable]
  741. name=Docker CE Stable - \$basearch
  742. baseurl=https://mirrors.aliyun.com/docker-ce/linux/centos/$(rpm --eval '%{centos_ver}')/\$basearch/stable
  743. enabled=1
  744. gpgcheck=1
  745. gpgkey=https://mirrors.aliyun.com/docker-ce/linux/centos/gpg
  746. EOF
  747. if [[ "${OFFLINE_TAG:-}" != "1" ]];then
  748. [ -f "$(which docker)" ] && yum remove -y docker-ce docker-ce-cli containerd.io
  749. yum install -y "docker-ce${version}" "docker-ce-cli${version}" containerd.io bash-completion
  750. fi
  751. [ -f /usr/share/bash-completion/completions/docker ] && \
  752. cp -f /usr/share/bash-completion/completions/docker /etc/bash_completion.d/
  753. [ ! -d /etc/docker ] && mkdir /etc/docker
  754. # /etc/docker/daemon.json
  755. cat << EOF > /etc/docker/daemon.json
  756. {
  757. "data-root": "/var/lib/docker",
  758. "log-driver": "json-file",
  759. "log-opts": {
  760. "max-size": "100m",
  761. "max-file": "3"
  762. },
  763. "default-ulimits": {
  764. "nofile": {
  765. "Name": "nofile",
  766. "Hard": 655360,
  767. "Soft": 655360
  768. },
  769. "nproc": {
  770. "Name": "nproc",
  771. "Hard": 655360,
  772. "Soft": 655360
  773. }
  774. },
  775. "live-restore": true,
  776. "oom-score-adjust": -1000,
  777. "max-concurrent-downloads": 10,
  778. "max-concurrent-uploads": 10,
  779. "storage-driver": "overlay2",
  780. "storage-opts": ["overlay2.override_kernel_check=true"],
  781. "exec-opts": ["native.cgroupdriver=systemd"],
  782. "registry-mirrors": [
  783. "https://573d5l8e.mirror.aliyuncs.com"
  784. ]
  785. }
  786. EOF
  787. sed -i 's|#oom_score = 0|oom_score = -999|' /etc/containerd/config.toml
  788. # /etc/crictl.yaml
  789. cat << EOF > /etc/crictl.yaml
  790. runtime-endpoint: unix:///var/run/dockershim.sock
  791. image-endpoint: unix:///var/run/dockershim.sock
  792. timeout: 2
  793. debug: false
  794. pull-image-on-create: true
  795. disable-pull-on-run: false
  796. EOF
  797. systemctl enable containerd
  798. systemctl restart containerd
  799. systemctl enable docker
  800. systemctl restart docker
  801. }
  802. # 安装 containerd
  803. function script::install_containerd() {
  804. local version="-${1:-latest}"
  805. version="${version#-latest}"
  806. # /etc/yum.repos.d/docker-ce.repo
  807. cat << EOF > /etc/yum.repos.d/docker-ce.repo
  808. [docker-ce-stable]
  809. name=Docker CE Stable - \$basearch
  810. baseurl=https://mirrors.aliyun.com/docker-ce/linux/centos/$(rpm --eval '%{centos_ver}')/\$basearch/stable
  811. enabled=1
  812. gpgcheck=1
  813. gpgkey=https://mirrors.aliyun.com/docker-ce/linux/centos/gpg
  814. EOF
  815. if [[ "${OFFLINE_TAG:-}" != "1" ]];then
  816. [ -f "$(which runc)" ] && yum remove -y runc
  817. [ -f "$(which containerd)" ] && yum remove -y containerd.io
  818. yum install -y containerd.io"${version}" containernetworking bash-completion
  819. fi
  820. [ -d /etc/bash_completion.d ] && crictl completion bash > /etc/bash_completion.d/crictl
  821. containerd config default > /etc/containerd/config.toml
  822. sed -i -e "s#k8s.gcr.io#registry.cn-hangzhou.aliyuncs.com/kainstall#g" \
  823. -e "s#https://registry-1.docker.io#https://573d5l8e.mirror.aliyuncs.com#g" \
  824. -e "s#SystemdCgroup = false#SystemdCgroup = true#g" \
  825. -e "s#oom_score = 0#oom_score = -999#" \
  826. -e "s#max_concurrent_downloads = 3#max_concurrent_downloads = 10#g" /etc/containerd/config.toml
  827. grep docker.io /etc/containerd/config.toml || sed -i -e "/registry.mirrors]/a\ \ \ \ \ \ \ \ [plugins.\"io.containerd.grpc.v1.cri\".registry.mirrors.\"docker.io\"]\n endpoint = [\"https://573d5l8e.mirror.aliyuncs.com\"]" \
  828. /etc/containerd/config.toml
  829. # /etc/crictl.yaml
  830. cat << EOF > /etc/crictl.yaml
  831. runtime-endpoint: unix:///run/containerd/containerd.sock
  832. image-endpoint: unix:///run/containerd/containerd.sock
  833. timeout: 2
  834. debug: false
  835. pull-image-on-create: true
  836. disable-pull-on-run: false
  837. EOF
  838. systemctl restart containerd
  839. systemctl enable containerd
  840. }
  841. # 安装 cri-o
  842. function script::install_cri-o() {
  843. local version="${1:-latest}"
  844. version="${version##latest}"
  845. os="CentOS_$(rpm --eval '%{centos_ver}')" && echo "${os}"
  846. # /etc/yum.repos.d/devel_kubic_libcontainers_stable.repo
  847. cat << EOF > /etc/yum.repos.d/devel_kubic_libcontainers_stable.repo
  848. [devel_kubic_libcontainers_stable]
  849. name=Stable Releases of Upstream github.com/containers packages
  850. type=rpm-md
  851. baseurl=https://download.opensuse.org/repositories/devel:/kubic:/libcontainers:/stable/${os}/
  852. gpgcheck=1
  853. gpgkey=https://download.opensuse.org/repositories/devel:/kubic:/libcontainers:/stable/${os}/repodata/repomd.xml.key
  854. enabled=1
  855. [devel_kubic_libcontainers_stable_cri-o]
  856. name=devel:kubic:libcontainers:stable:cri-o
  857. type=rpm-md
  858. baseurl=https://download.opensuse.org/repositories/devel:/kubic:/libcontainers:/stable:/cri-o:/${version}/${os}/
  859. gpgcheck=1
  860. gpgkey=https://download.opensuse.org/repositories/devel:/kubic:/libcontainers:/stable:/cri-o:/${version}/${os}/repodata/repomd.xml.key
  861. enabled=1
  862. EOF
  863. if [[ "${OFFLINE_TAG:-}" != "1" ]];then
  864. [ -f "$(which runc)" ] && yum remove -y runc
  865. [ -f "$(which crio)" ] && yum remove -y cri-o
  866. [ -f "$(which docker)" ] && yum remove -y docker-ce docker-ce-cli containerd.io
  867. yum install -y runc cri-o bash-completion --disablerepo=docker-ce-stable || yum install -y runc cri-o bash-completion
  868. fi
  869. [ -d /etc/bash_completion.d ] && \
  870. { crictl completion bash > /etc/bash_completion.d/crictl; \
  871. crio completion bash > /etc/bash_completion.d/crio; \
  872. crio-status completion bash > /etc/bash_completion.d/crio-status; }
  873. [ ! -f /etc/crio/crio.conf ] && crio config --default > /etc/crio/crio.conf
  874. sed -i -e "s#k8s.gcr.io#registry.cn-hangzhou.aliyuncs.com/kainstall#g" \
  875. -e 's|#registries = \[|registries = ["docker.io", "quay.io"]|g' /etc/crio/crio.conf
  876. # /etc/crio/crio.conf
  877. cat << EOF >> /etc/crio/crio.conf
  878. [crio.image]
  879. pause_image = "registry.cn-hangzhou.aliyuncs.com/kainstall/pause:3.6"
  880. EOF
  881. # /etc/containers/registries.conf.d/000-dockerio.conf
  882. [ -d /etc/containers/registries.conf.d ] && cat << EOF > /etc/containers/registries.conf.d/000-dockerio.conf
  883. [[registry]]
  884. prefix = "docker.io"
  885. insecure = false
  886. blocked = false
  887. location = "docker.io"
  888. [[registry.mirror]]
  889. location = "573d5l8e.mirror.aliyuncs.com"
  890. insecure = true
  891. EOF
  892. # /etc/crictl.yaml
  893. cat << EOF > /etc/crictl.yaml
  894. runtime-endpoint: unix:///var/run/crio/crio.sock
  895. image-endpoint: unix:///var/run/crio/crio.sock
  896. timeout: 2
  897. debug: false
  898. pull-image-on-create: true
  899. disable-pull-on-run: false
  900. EOF
  901. # /etc/cni/net.d/100-crio-bridge.conf
  902. sed -i "s#10.85.0.0/16#${KUBE_POD_SUBNET:-10.85.0.0/16}#g" /etc/cni/net.d/100-crio-bridge.conf
  903. # /etc/cni/net.d/10-crio.conf
  904. cat << EOF > /etc/cni/net.d/10-crio.conf
  905. {
  906. $(grep cniVersion /etc/cni/net.d/100-crio-bridge.conf)
  907. "name": "crio",
  908. "type": "flannel"
  909. }
  910. EOF
  911. mv /etc/cni/net.d/100-crio-bridge.conf /etc/cni/net.d/10-crio.conf /etc/cni/net.d/200-loopback.conf /tmp/
  912. systemctl restart crio
  913. systemctl enable crio
  914. }
  915. # 安装kube组件
  916. function script::install_kube() {
  917. local version="-${1:-latest}"
  918. version="${version#-latest}"
  919. # /etc/yum.repos.d/kubernetes.repo
  920. cat <<EOF > /etc/yum.repos.d/kubernetes.repo
  921. [kubernetes]
  922. name=Kubernetes
  923. baseurl=https://mirrors.aliyun.com/kubernetes/yum/repos/kubernetes-el7-x86_64/
  924. enabled=1
  925. gpgcheck=0
  926. repo_gpgcheck=0
  927. gpgkey=https://mirrors.aliyun.com/kubernetes/yum/doc/yum-key.gpg https://mirrors.aliyun.com/kubernetes/yum/doc/rpm-package-key.gpg
  928. EOF
  929. if [[ "${OFFLINE_TAG:-}" != "1" ]];then
  930. [ -f /usr/bin/kubeadm ] && yum remove -y kubeadm
  931. [ -f /usr/bin/kubelet ] && yum remove -y kubelet
  932. [ -f /usr/bin/kubectl ] && yum remove -y kubectl
  933. yum install -y "kubeadm${version}" "kubelet${version}" "kubectl${version}" --disableexcludes=kubernetes
  934. fi
  935. [ -d /etc/bash_completion.d ] && \
  936. { kubectl completion bash > /etc/bash_completion.d/kubectl; \
  937. kubeadm completion bash > /etc/bash_completion.d/kubadm; }
  938. [ ! -d /usr/lib/systemd/system/kubelet.service.d ] && mkdir -p /usr/lib/systemd/system/kubelet.service.d
  939. cat << EOF > /usr/lib/systemd/system/kubelet.service.d/11-cgroup.conf
  940. [Service]
  941. CPUAccounting=true
  942. MemoryAccounting=true
  943. BlockIOAccounting=true
  944. ExecStartPre=/bin/bash -c '/bin/mkdir -p /sys/fs/cgroup/{cpuset,memory,hugetlb,systemd,pids,"cpu,cpuacct"}/{system,kube,kubepods}.slice||:'
  945. Slice=kube.slice
  946. EOF
  947. systemctl daemon-reload
  948. systemctl enable kubelet
  949. systemctl restart kubelet
  950. }
  951. # 安装haproxy
  952. function script::install_haproxy() {
  953. local api_servers="$*"
  954. if [[ "${OFFLINE_TAG:-}" != "1" ]];then
  955. [ -f /usr/bin/haproxy ] && yum remove -y haproxy
  956. yum install -y haproxy
  957. fi
  958. # /etc/haproxy/haproxy.cfg
  959. [ ! -f /etc/haproxy/haproxy.cfg_bak ] && cp /etc/haproxy/haproxy.cfg{,_bak}
  960. cat << EOF > /etc/haproxy/haproxy.cfg
  961. global
  962. log /dev/log local0
  963. log /dev/log local1 notice
  964. tune.ssl.default-dh-param 2048
  965. defaults
  966. log global
  967. mode http
  968. option dontlognull
  969. timeout connect 5000ms
  970. timeout client 600000ms
  971. timeout server 600000ms
  972. listen stats
  973. bind :19090
  974. mode http
  975. balance
  976. stats uri /haproxy_stats
  977. stats auth admin:admin123
  978. stats admin if TRUE
  979. frontend kube-apiserver-https
  980. mode tcp
  981. option tcplog
  982. bind :6443
  983. default_backend kube-apiserver-backend
  984. backend kube-apiserver-backend
  985. mode tcp
  986. balance roundrobin
  987. stick-table type ip size 200k expire 30m
  988. stick on src
  989. $(index=1;for h in $api_servers;do echo " server apiserver${index} $h:6443 check";index=$((index+1));done)
  990. EOF
  991. systemctl enable haproxy
  992. systemctl restart haproxy
  993. }
  994. # 检查用到的命令
  995. function check::command() {
  996. check::command_exists ssh openssh-clients
  997. check::command_exists sshpass sshpass
  998. check::command_exists wget wget
  999. [[ "${OFFLINE_TAG:-}" == "1" ]] && check::command_exists tar tar
  1000. }
  1001. # 检查ssh连通性
  1002. function check::ssh_conn() {
  1003. for host in $MASTER_NODES $WORKER_NODES
  1004. do
  1005. [ "$host" == "127.0.0.1" ] && continue
  1006. command::exec "${host}" "echo 0"
  1007. check::exit_code "$?" "check" "ssh $host connection" "exit"
  1008. done
  1009. }
  1010. # 检查os系统支持
  1011. function check::os() {
  1012. log::info "[check]" "os support: ${OS_SUPPORT}"
  1013. for host in $MASTER_NODES $WORKER_NODES
  1014. do
  1015. command::exec "${host}" "
  1016. [ -f /etc/os-release ] && source /etc/os-release
  1017. echo client_os:\${ID:-}\${VERSION_ID:-}
  1018. if [[ \"${OS_SUPPORT}\" == *\"\${ID:-}\${VERSION_ID:-}\"* ]]; then
  1019. exit 0
  1020. fi
  1021. exit 1
  1022. "
  1023. check::exit_code "$?" "check" "$host os support" "exit"
  1024. done
  1025. }
  1026. # 检查os kernel 版本
  1027. function check::kernel() {
  1028. local version=${1:-}
  1029. log::info "[check]" "kernel version not less than ${version}"
  1030. version=$(echo "${version}" | awk -F. '{ printf("%d%03d%03d\n", $1,$2,$3); }')
  1031. for host in $MASTER_NODES $WORKER_NODES
  1032. do
  1033. command::exec "${host}" "
  1034. kernel_version=\$(uname -r)
  1035. kernel_version=\$(echo \${kernel_version/-*} | awk -F. '{ printf(\"%d%03d%03d\n\", \$1,\$2,\$3); }')
  1036. echo kernel_version \${kernel_version}
  1037. [[ \${kernel_version} -ge ${version} ]] && exit 0 || exit 1
  1038. "
  1039. check::exit_code "$?" "check" "$host kernel version" "exit"
  1040. done
  1041. }
  1042. # 检查api-server连通性
  1043. function check::apiserver_conn() {
  1044. command::exec "${MGMT_NODE}" "kubectl get node"
  1045. check::exit_code "$?" "check" "conn apiserver" "exit"
  1046. }
  1047. # 检查返回码
  1048. function check::exit_code() {
  1049. local code=${1:-}
  1050. local app=${2:-}
  1051. local desc=${3:-}
  1052. local exit_script=${4:-}
  1053. if [[ "${code}" == "0" ]]; then
  1054. log::info "[${app}]" "${desc} succeeded."
  1055. else
  1056. log::error "[${app}]" "${desc} failed."
  1057. [[ "$exit_script" == "exit" ]] && exit "$code"
  1058. fi
  1059. }
  1060. # 预检
  1061. function check::preflight() {
  1062. # check command
  1063. check::command
  1064. # check ssh conn
  1065. check::ssh_conn
  1066. # check os
  1067. check::os
  1068. # check os kernel
  1069. [[ "${KUBE_NETWORK:-}" == "cilium" ]] && check::kernel 4.9.17
  1070. # check api-server conn
  1071. if [[ $(( ${ADD_TAG:-0} + ${DEL_TAG:-0} + ${UPGRADE_TAG:-0} + ${RENEW_CERT_TAG:-0} )) -gt 0 ]]; then
  1072. check::apiserver_conn
  1073. fi
  1074. }
  1075. # 安装包
  1076. function install::package() {
  1077. if [[ "${KUBE_CRI}" == "cri-o" && "${KUBE_CRI_VERSION}" == "latest" ]]; then
  1078. KUBE_CRI_VERSION="${KUBE_VERSION}"
  1079. if [[ "${KUBE_CRI_VERSION}" == "latest" ]]; then
  1080. if command::exec "127.0.0.1" "wget https://storage.googleapis.com/kubernetes-release/release/stable.txt -q -O -"; then
  1081. KUBE_CRI_VERSION="${COMMAND_OUTPUT#v}"
  1082. else
  1083. log::error "[install]" "get kubernetes stable version error. Please specify the version!"
  1084. exit 1
  1085. fi
  1086. fi
  1087. KUBE_CRI_VERSION="${KUBE_CRI_VERSION%.*}"
  1088. fi
  1089. for host in $MASTER_NODES $WORKER_NODES
  1090. do
  1091. # install cri
  1092. log::info "[install]" "install ${KUBE_CRI} on $host."
  1093. command::exec "${host}" "
  1094. export OFFLINE_TAG=${OFFLINE_TAG:-0}
  1095. $(declare -f script::install_"${KUBE_CRI}")
  1096. script::install_${KUBE_CRI} $KUBE_CRI_VERSION
  1097. "
  1098. check::exit_code "$?" "install" "install ${KUBE_CRI} on $host"
  1099. # install kube
  1100. log::info "[install]" "install kube on $host"
  1101. command::exec "${host}" "
  1102. export OFFLINE_TAG=${OFFLINE_TAG:-0}
  1103. $(declare -f script::install_kube)
  1104. script::install_kube $KUBE_VERSION
  1105. "
  1106. check::exit_code "$?" "install" "install kube on $host"
  1107. done
  1108. local apiservers=$MASTER_NODES
  1109. if [[ "$apiservers" == "127.0.0.1" ]]; then
  1110. command::exec "${MGMT_NODE}" "ip -o route get to 8.8.8.8 | sed -n 's/.*src \([0-9.]\+\).*/\1/p'"
  1111. get::command_output "apiservers" "$?"
  1112. fi
  1113. if [[ "${ADD_TAG:-}" == "1" ]]; then
  1114. command::exec "${MGMT_NODE}" "
  1115. kubectl get node --selector='node-role.kubernetes.io/master' -o jsonpath='{$.items[*].status.addresses[?(@.type==\"InternalIP\")].address}'
  1116. "
  1117. get::command_output "apiservers" "$?"
  1118. fi
  1119. for host in $WORKER_NODES
  1120. do
  1121. # install haproxy
  1122. log::info "[install]" "install haproxy on $host"
  1123. command::exec "${host}" "
  1124. export OFFLINE_TAG=${OFFLINE_TAG:-0}
  1125. $(declare -f script::install_haproxy)
  1126. script::install_haproxy \"$apiservers\"
  1127. "
  1128. check::exit_code "$?" "install" "install haproxy on $host"
  1129. done
  1130. # 10年证书
  1131. if [[ "${CERT_YEAR_TAG:-}" == "1" ]]; then
  1132. local version="${KUBE_VERSION}"
  1133. if [[ "${version}" == "latest" ]]; then
  1134. if command::exec "127.0.0.1" "wget https://storage.googleapis.com/kubernetes-release/release/stable.txt -q -O -"; then
  1135. version="${COMMAND_OUTPUT#v}"
  1136. else
  1137. log::error "[install]" "get kubernetes stable version error. Please specify the version!"
  1138. exit 1
  1139. fi
  1140. fi
  1141. log::info "[install]" "download kubeadm 10 years certs client"
  1142. local certs_file="${OFFLINE_DIR}/bins/kubeadm-linux-amd64"
  1143. MGMT_NODE="127.0.0.1" utils::download_file "${GITHUB_PROXY}https://github.com/lework/kubeadm-certs/releases/download/v${version}/kubeadm-linux-amd64" "${certs_file}"
  1144. for host in $MASTER_NODES $WORKER_NODES
  1145. do
  1146. log::info "[install]" "scp kubeadm client to $host"
  1147. command::scp "${host}" "${certs_file}" "/tmp/kubeadm-linux-amd64"
  1148. check::exit_code "$?" "install" "scp kubeadm client to $host" "exit"
  1149. command::exec "${host}" "
  1150. set -e
  1151. if [[ -f /tmp/kubeadm-linux-amd64 ]]; then
  1152. [[ -f /usr/bin/kubeadm && ! -f /usr/bin/kubeadm_src ]] && mv -fv /usr/bin/kubeadm{,_src}
  1153. mv -fv /tmp/kubeadm-linux-amd64 /usr/bin/kubeadm
  1154. chmod +x /usr/bin/kubeadm
  1155. else
  1156. echo \"not found /tmp/kubeadm-linux-amd64\"
  1157. exit 1
  1158. fi
  1159. "
  1160. check::exit_code "$?" "install" "$host: use kubeadm 10 years certs client"
  1161. done
  1162. fi
  1163. }
  1164. # 升级节点内核
  1165. function init::upgrade_kernel() {
  1166. [[ "${UPGRADE_KERNEL_TAG:-}" != "1" ]] && return
  1167. for host in $MASTER_NODES $WORKER_NODES
  1168. do
  1169. log::info "[init]" "upgrade kernel: $host"
  1170. command::exec "${host}" "
  1171. export OFFLINE_TAG=${OFFLINE_TAG:-0}
  1172. $(declare -f script::upgrade_kernel)
  1173. script::upgrade_kernel
  1174. "
  1175. check::exit_code "$?" "init" "upgrade kernel $host" "exit"
  1176. done
  1177. for host in $MASTER_NODES $WORKER_NODES
  1178. do
  1179. command::exec "${host}" "bash -c 'sleep 15 && reboot' &>/dev/null &"
  1180. check::exit_code "$?" "init" "$host: Wait for 15s to restart"
  1181. done
  1182. log::info "[notice]" "Please execute the command again!"
  1183. log::access "[command]" "bash $0 ${SCRIPT_PARAMETER// --upgrade-kernel/}"
  1184. exit 0
  1185. }
  1186. # 节点证书续期
  1187. function cert::renew_node() {
  1188. local role="${1:-master}"
  1189. local hosts=""
  1190. local kubelet_config=""
  1191. command::exec "${MGMT_NODE}" "
  1192. kubectl get node --selector='node-role.kubernetes.io/${role}' -o jsonpath='{range.items[*]}{.metadata.name } {end}'
  1193. "
  1194. get::command_output "hosts" "$?"
  1195. for host in ${hosts}
  1196. do
  1197. log::info "[cert]" "drain $host"
  1198. command::exec "${MGMT_NODE}" "kubectl drain $host --force --ignore-daemonsets --delete-local-data"
  1199. check::exit_code "$?" "cert" "$host: drain"
  1200. sleep 5
  1201. if [[ "${role}" == "master" ]]; then
  1202. command::exec "${host}" "cp -rf /etc/kubernetes /etc/kubernetes_\$(date +%Y-%m-%d)"
  1203. check::exit_code "$?" "cert" "$host: backup kubernetes config"
  1204. command::exec "${host}" "kubeadm certs renew all 2>/dev/null|| kubeadm alpha certs renew all"
  1205. check::exit_code "$?" "cert" "$host: renew certs"
  1206. command::exec "${host}" "
  1207. $(declare -f utils::retry)
  1208. kill -s SIGHUP \$(pidof etcd) && \
  1209. utils::retry 10 \"echo -n | openssl s_client -connect localhost:2379 2>&1 | sed -ne '/-BEGIN CERTIFICATE-/,/-END CERTIFICATE-/p' | openssl x509 -text -noout | grep Not\"
  1210. "
  1211. check::exit_code "$?" "cert" "$host: restart etcd"
  1212. command::exec "${host}" "
  1213. $(declare -f utils::retry)
  1214. kill -s SIGHUP \$(pidof kube-apiserver) && \
  1215. utils::retry 10 \"echo -n | openssl s_client -connect localhost:6443 2>&1 | sed -ne '/-BEGIN CERTIFICATE-/,/-END CERTIFICATE-/p' | openssl x509 -text -noout | grep Not\"
  1216. "
  1217. check::exit_code "$?" "cert" "$host: restart kube-apiserver"
  1218. command::exec "${host}" "
  1219. $(declare -f utils::retry)
  1220. kill -s SIGHUP \$(pidof kube-controller-manager) && \
  1221. utils::retry 10 \"echo -n | openssl s_client -connect localhost:10257 2>&1 | sed -ne '/-BEGIN CERTIFICATE-/,/-END CERTIFICATE-/p' | openssl x509 -text -noout | grep Not\"
  1222. "
  1223. check::exit_code "$?" "cert" "$host: restart kube-controller-manager"
  1224. command::exec "${host}" "
  1225. $(declare -f utils::retry)
  1226. kill -s SIGHUP \$(pidof kube-scheduler) && \
  1227. utils::retry 10 \"echo -n | openssl s_client -connect localhost:10259 2>&1 | sed -ne '/-BEGIN CERTIFICATE-/,/-END CERTIFICATE-/p' | openssl x509 -text -noout | grep Not\"
  1228. "
  1229. check::exit_code "$?" "cert" "$host: restart kube-scheduler"
  1230. fi
  1231. log::info "[cert]" "get kubelet config"
  1232. command::exec "${MGMT_NODE}" "
  1233. kubeadm kubeconfig user --org system:nodes --client-name system:node:${host} --config /etc/kubernetes/kubeadmcfg.yaml || kubeadm alpha kubeconfig user --org system:nodes --client-name system:node:${host} --config /etc/kubernetes/kubeadmcfg.yaml
  1234. "
  1235. get::command_output "kubelet_config" "$?" "exit"
  1236. if [[ "$kubelet_config" != "" ]]; then
  1237. log::info "[cert]" "copy kubelet config"
  1238. command::exec "${host}" "
  1239. cp /etc/kubernetes/kubelet.conf /etc/kubernetes/kubelet.conf_bak
  1240. echo '$(printf "%s" "${kubelet_config}" | sed 's#https://.*:#https://127.0.0.1:#g')' > /etc/kubernetes/kubelet.conf
  1241. "
  1242. check::exit_code "$?" "cert" "$host: copy kubelet config"
  1243. command::exec "${host}" "rm -rfv /var/lib/kubelet/pki/*"
  1244. check::exit_code "$?" "cert" "$host: delete kubelet pki files"
  1245. command::exec "${host}" "
  1246. $(declare -f utils::retry)
  1247. systemctl restart kubelet && \
  1248. utils::retry 10 \"echo -n | openssl s_client -connect localhost:10250 2>&1 | sed -ne '/-BEGIN CERTIFICATE-/,/-END CERTIFICATE-/p' | openssl x509 -text -noout | grep Not\"
  1249. "
  1250. local status="$?"
  1251. check::exit_code "${status}" "cert" "$host: restart kubelet"
  1252. if [[ "${status}" == "0" ]]; then
  1253. sleep 5
  1254. command::exec "${MGMT_NODE}" "kubectl uncordon ${host}"
  1255. check::exit_code "$?" "cert" "uncordon ${host} node"
  1256. fi
  1257. fi
  1258. done
  1259. }
  1260. # 证书续期
  1261. function cert::renew() {
  1262. log::info "[cert]" "renew cluster cert"
  1263. cert::renew_node "master"
  1264. cert::renew_node "worker"
  1265. log::info "[cert]" "cluster status"
  1266. command::exec "${MGMT_NODE}" "
  1267. echo
  1268. kubectl get node
  1269. echo
  1270. kubeadm certs check-expiration 2>/dev/null || kubeadm alpha certs check-expiration
  1271. " && printf "%s" "${COMMAND_OUTPUT}"
  1272. }
  1273. # 初始化节点配置
  1274. function init::node_config() {
  1275. local master_index=${master_index:-1}
  1276. local worker_index=${worker_index:-1}
  1277. log::info "[init]" "Get $MGMT_NODE InternalIP."
  1278. command::exec "${MGMT_NODE}" "
  1279. ip -4 route get 8.8.8.8 2>/dev/null | head -1 | awk '{print \$7}'
  1280. "
  1281. get::command_output "MGMT_NODE_IP" "$?" "exit"
  1282. # master
  1283. for host in $MASTER_NODES
  1284. do
  1285. log::info "[init]" "master: $host"
  1286. command::exec "${host}" "
  1287. export OFFLINE_TAG=${OFFLINE_TAG:-0} KUBE_APISERVER=${KUBE_APISERVER} SKIP_SET_OS_REPO=${SKIP_SET_OS_REPO:-false}
  1288. $(declare -f script::init_node)
  1289. script::init_node
  1290. "
  1291. check::exit_code "$?" "init" "init master $host" "exit"
  1292. # 设置主机名和解析
  1293. command::exec "${host}" "
  1294. printf \"\\n${MGMT_NODE_IP} $KUBE_APISERVER\\n$node_hosts\" >> /etc/hosts
  1295. hostnamectl set-hostname ${HOSTNAME_PREFIX}-master-node${master_index}
  1296. "
  1297. check::exit_code "$?" "init" "$host set hostname and hostname resolution"
  1298. # set audit-policy
  1299. log::info "[init]" "$host: set audit-policy file."
  1300. command::exec "${host}" "
  1301. [ ! -d etc/kubernetes ] && mkdir -p /etc/kubernetes
  1302. cat << EOF > /etc/kubernetes/audit-policy.yaml
  1303. # Log all requests at the Metadata level.
  1304. apiVersion: audit.k8s.io/v1
  1305. kind: Policy
  1306. rules:
  1307. - level: Metadata
  1308. EOF
  1309. "
  1310. check::exit_code "$?" "init" "$host: set audit-policy file" "exit"
  1311. master_index=$((master_index + 1))
  1312. done
  1313. # worker
  1314. for host in $WORKER_NODES
  1315. do
  1316. log::info "[init]" "worker: $host"
  1317. command::exec "${host}" "
  1318. export OFFLINE_TAG=${OFFLINE_TAG:-0} KUBE_APISERVER=${KUBE_APISERVER} SKIP_SET_OS_REPO=${SKIP_SET_OS_REPO:-false}
  1319. $(declare -f script::init_node)
  1320. script::init_node
  1321. "
  1322. check::exit_code "$?" "init" "init worker $host" "exit"
  1323. # 设置主机名和解析
  1324. command::exec "${host}" "
  1325. printf \"\\n127.0.0.1 $KUBE_APISERVER\\n$node_hosts\" >> /etc/hosts
  1326. hostnamectl set-hostname ${HOSTNAME_PREFIX}-worker-node${worker_index}
  1327. "
  1328. worker_index=$((worker_index + 1))
  1329. done
  1330. }
  1331. # 初始化节点
  1332. function init::node() {
  1333. init::upgrade_kernel
  1334. local node_hosts=""
  1335. local i=1
  1336. for h in $MASTER_NODES
  1337. do
  1338. node_hosts="${node_hosts}\n$h ${HOSTNAME_PREFIX}-master-node${i}"
  1339. i=$((i + 1))
  1340. done
  1341. local i=1
  1342. for h in $WORKER_NODES
  1343. do
  1344. node_hosts="${node_hosts}\n$h ${HOSTNAME_PREFIX}-worker-node${i}"
  1345. i=$((i + 1))
  1346. done
  1347. init::node_config
  1348. }
  1349. # 初始化添加的节点
  1350. function init::add_node() {
  1351. init::upgrade_kernel
  1352. local master_index=0
  1353. local worker_index=0
  1354. local node_hosts=""
  1355. local add_node_hosts=""
  1356. command::exec "${MGMT_NODE}" "
  1357. kubectl get node --selector='node-role.kubernetes.io/master' -o jsonpath='{range.items[*]}{.status.addresses[?(@.type==\"InternalIP\")].address } {end}' | awk '{print \$1}'
  1358. "
  1359. get::command_output "MGMT_NODE" "$?" "exit"
  1360. # 获取现有集群节点主机名
  1361. command::exec "${MGMT_NODE}" "
  1362. kubectl get node -o jsonpath='{range.items[*]}{.status.addresses[?(@.type==\"InternalIP\")].address} {.metadata.name }\\n{end}'
  1363. "
  1364. get::command_output "node_hosts" "$?" "exit"
  1365. for host in $MASTER_NODES $WORKER_NODES
  1366. do
  1367. if [[ $node_hosts == *"$host"* ]]; then
  1368. log::error "[init]" "The host $host is already in the cluster!"
  1369. exit 1
  1370. fi
  1371. done
  1372. if [[ "$MASTER_NODES" != "" ]]; then
  1373. command::exec "${MGMT_NODE}" "
  1374. kubectl get node --selector='node-role.kubernetes.io/master' -o jsonpath='{\$.items[*].metadata.name}' |grep -Eo 'node[0-9]*'|grep -Eo '[0-9]*'|awk -F ' ' 'BEGIN {max = 0} {if (\$0+0 > max+0) max=\$0} END {print max}'
  1375. "
  1376. get::command_output "master_index" "$?" "exit"
  1377. master_index=$(( master_index + 1 ))
  1378. local i=$master_index
  1379. for host in $MASTER_NODES
  1380. do
  1381. add_node_hosts="${add_node_hosts}\n${host:-} ${HOSTNAME_PREFIX}-master-node${i}"
  1382. i=$((i + 1))
  1383. done
  1384. fi
  1385. if [[ "$WORKER_NODES" != "" ]]; then
  1386. command::exec "${MGMT_NODE}" "
  1387. kubectl get node --selector='node-role.kubernetes.io/worker' -o jsonpath='{\$.items[*].metadata.name}'| grep -Eo 'node[0-9]*'|grep -Eo '[0-9]*'|awk 'BEGIN {max = 0} {if (\$0+0 > max+0) max=\$0} END {print max}' || echo 0
  1388. "
  1389. get::command_output "worker_index" "$?" "exit"
  1390. worker_index=$(( worker_index + 1 ))
  1391. local i=$worker_index
  1392. for host in $WORKER_NODES
  1393. do
  1394. add_node_hosts="${add_node_hosts}\n${host:-} ${HOSTNAME_PREFIX}-worker-node${i}"
  1395. i=$((i + 1))
  1396. done
  1397. fi
  1398. # 向集群节点添加新增的节点主机名解析
  1399. for host in $(echo -ne "$node_hosts" | awk '{print $1}')
  1400. do
  1401. command::exec "${host}" "
  1402. printf \"$add_node_hosts\" >> /etc/hosts
  1403. "
  1404. check::exit_code "$?" "init" "$host add new node hostname resolution"
  1405. done
  1406. node_hosts="${node_hosts}\n${add_node_hosts}"
  1407. init::node_config
  1408. }
  1409. # 集群初始化
  1410. function kubeadm::init() {
  1411. log::info "[kubeadm init]" "kubeadm init on ${MGMT_NODE}"
  1412. log::info "[kubeadm init]" "${MGMT_NODE}: set kubeadmcfg.yaml"
  1413. command::exec "${MGMT_NODE}" "
  1414. PAUSE_VERSION=$(kubeadm config images list 2>/dev/null | awk -F: '/pause/ {print $2}')
  1415. cat << EOF > /etc/kubernetes/kubeadmcfg.yaml
  1416. ---
  1417. apiVersion: kubeadm.k8s.io/v1beta2
  1418. kind: InitConfiguration
  1419. ${kubelet_nodeRegistration}
  1420. ---
  1421. apiVersion: kubeproxy.config.k8s.io/v1alpha1
  1422. kind: KubeProxyConfiguration
  1423. mode: ipvs
  1424. ipvs:
  1425. minSyncPeriod: 5s
  1426. syncPeriod: 5s
  1427. # ipvs 负载策略
  1428. scheduler: 'wrr'
  1429. ---
  1430. apiVersion: kubelet.config.k8s.io/v1beta1
  1431. kind: KubeletConfiguration
  1432. maxPods: 200
  1433. cgroupDriver: systemd
  1434. runtimeRequestTimeout: 5m
  1435. # 此配置保证了 kubelet 能在 swap 开启的情况下启动
  1436. failSwapOn: false
  1437. nodeStatusUpdateFrequency: 5s
  1438. rotateCertificates: true
  1439. imageGCLowThresholdPercent: 70
  1440. imageGCHighThresholdPercent: 80
  1441. # 软驱逐阀值
  1442. evictionSoft:
  1443. imagefs.available: 15%
  1444. memory.available: 512Mi
  1445. nodefs.available: 15%
  1446. nodefs.inodesFree: 10%
  1447. # 达到软阈值之后,持续时间超过多久才进行驱逐
  1448. evictionSoftGracePeriod:
  1449. imagefs.available: 3m
  1450. memory.available: 1m
  1451. nodefs.available: 3m
  1452. nodefs.inodesFree: 1m
  1453. # 硬驱逐阀值
  1454. evictionHard:
  1455. imagefs.available: 10%
  1456. memory.available: 256Mi
  1457. nodefs.available: 10%
  1458. nodefs.inodesFree: 5%
  1459. evictionMaxPodGracePeriod: 30
  1460. # 节点资源预留
  1461. kubeReserved:
  1462. cpu: 200m\$(if [[ \$(cat /proc/meminfo | awk '/MemTotal/ {print \$2}') -gt 3670016 ]]; then echo -e '\n memory: 256Mi';fi)
  1463. ephemeral-storage: 1Gi
  1464. systemReserved:
  1465. cpu: 300m\$(if [[ \$(cat /proc/meminfo | awk '/MemTotal/ {print \$2}') -gt 3670016 ]]; then echo -e '\n memory: 512Mi';fi)
  1466. ephemeral-storage: 1Gi
  1467. kubeReservedCgroup: /kube.slice
  1468. systemReservedCgroup: /system.slice
  1469. enforceNodeAllocatable:
  1470. - pods
  1471. ---
  1472. apiVersion: kubeadm.k8s.io/v1beta2
  1473. kind: ClusterConfiguration
  1474. kubernetesVersion: $KUBE_VERSION
  1475. controlPlaneEndpoint: $KUBE_APISERVER:6443
  1476. networking:
  1477. dnsDomain: $KUBE_DNSDOMAIN
  1478. podSubnet: $KUBE_POD_SUBNET
  1479. serviceSubnet: $KUBE_SERVICE_SUBNET
  1480. imageRepository: $KUBE_IMAGE_REPO
  1481. apiServer:
  1482. certSANs:
  1483. - 127.0.0.1
  1484. - $KUBE_APISERVER
  1485. $(for h in $MASTER_NODES;do echo " - $h";done)
  1486. extraArgs:
  1487. event-ttl: '720h'
  1488. service-node-port-range: '30000-50000'
  1489. # 审计日志相关配置
  1490. audit-log-maxage: '20'
  1491. audit-log-maxbackup: '10'
  1492. audit-log-maxsize: '100'
  1493. audit-log-path: /var/log/kube-audit/audit.log
  1494. audit-policy-file: /etc/kubernetes/audit-policy.yaml
  1495. extraVolumes:
  1496. - name: audit-config
  1497. hostPath: /etc/kubernetes/audit-policy.yaml
  1498. mountPath: /etc/kubernetes/audit-policy.yaml
  1499. readOnly: true
  1500. pathType: File
  1501. - name: audit-log
  1502. hostPath: /var/log/kube-audit
  1503. mountPath: /var/log/kube-audit
  1504. pathType: DirectoryOrCreate
  1505. - name: localtime
  1506. hostPath: /etc/localtime
  1507. mountPath: /etc/localtime
  1508. readOnly: true
  1509. pathType: File
  1510. controllerManager:
  1511. extraArgs:
  1512. bind-address: 0.0.0.0
  1513. node-cidr-mask-size: '24'
  1514. deployment-controller-sync-period: '10s'
  1515. node-monitor-grace-period: '20s'
  1516. pod-eviction-timeout: '2m'
  1517. terminated-pod-gc-threshold: '30'
  1518. experimental-cluster-signing-duration: 87600h
  1519. feature-gates: RotateKubeletServerCertificate=true
  1520. extraVolumes:
  1521. - hostPath: /etc/localtime
  1522. mountPath: /etc/localtime
  1523. name: localtime
  1524. readOnly: true
  1525. pathType: File
  1526. scheduler:
  1527. extraArgs:
  1528. bind-address: 0.0.0.0
  1529. extraVolumes:
  1530. - hostPath: /etc/localtime
  1531. mountPath: /etc/localtime
  1532. name: localtime
  1533. readOnly: true
  1534. pathType: File
  1535. $(if [[ "${KUBE_VERSION}" == "1.21.1" ]]; then
  1536. echo "dns:
  1537. type: CoreDNS
  1538. imageRepository: docker.io
  1539. imageTag: 1.8.0"
  1540. fi)
  1541. EOF
  1542. "
  1543. check::exit_code "$?" "kubeadm init" "${MGMT_NODE}: set kubeadmcfg.yaml" "exit"
  1544. log::info "[kubeadm init]" "${MGMT_NODE}: kubeadm init start."
  1545. command::exec "${MGMT_NODE}" "kubeadm init --config=/etc/kubernetes/kubeadmcfg.yaml --upload-certs"
  1546. check::exit_code "$?" "kubeadm init" "${MGMT_NODE}: kubeadm init" "exit"
  1547. sleep 3
  1548. log::info "[kubeadm init]" "${MGMT_NODE}: set kube config."
  1549. command::exec "${MGMT_NODE}" "
  1550. mkdir -p \$HOME/.kube
  1551. sudo cp -f /etc/kubernetes/admin.conf \$HOME/.kube/config
  1552. "
  1553. check::exit_code "$?" "kubeadm init" "${MGMT_NODE}: set kube config" "exit"
  1554. if [[ "$(echo "$MASTER_NODES" | wc -w)" == "1" ]]; then
  1555. log::info "[kubeadm init]" "${MGMT_NODE}: delete master taint"
  1556. command::exec "${MGMT_NODE}" "kubectl taint nodes --all node-role.kubernetes.io/master-"
  1557. check::exit_code "$?" "kubeadm init" "${MGMT_NODE}: delete master taint"
  1558. fi
  1559. command::exec "${MGMT_NODE}" "
  1560. kubectl create clusterrolebinding node-client-auto-approve-csr --clusterrole=system:certificates.k8s.io:certificatesigningrequests:nodeclient --user=kubelet-bootstrap
  1561. kubectl create clusterrolebinding node-client-auto-renew-crt --clusterrole=system:certificates.k8s.io:certificatesigningrequests:selfnodeclient --group=system:nodes
  1562. kubectl create clusterrolebinding node-server-auto-renew-crt --clusterrole=system:certificates.k8s.io:certificatesigningrequests:selfnodeserver --group=system:nodes
  1563. "
  1564. check::exit_code "$?" "kubeadm init" "Auto-Approve kubelet cert csr" "exit"
  1565. }
  # Join the remaining control-plane nodes and every worker node to the cluster.
  #
  # Globals read:    MGMT_NODE, MASTER_NODES, WORKER_NODES, KUBE_APISERVER,
  #                  kubelet_nodeRegistration (not defined in this function)
  # Globals written: CACRT_HASH, INTI_CERTKEY, INIT_TOKEN, PAUSE_VERSION
  #                  (each one is assigned through get::command_output)
  function kubeadm::join() {
    # Keep the loop variable out of the global scope.
    local host

    log::info "[kubeadm join]" "master: get join token and cert info"

    # SHA-256 digest of the cluster CA public key, used for discovery pinning.
    command::exec "${MGMT_NODE}" "
      openssl x509 -pubkey -in /etc/kubernetes/pki/ca.crt | openssl rsa -pubin -outform der 2>/dev/null | openssl dgst -sha256 -hex | sed 's/^.* //'
    "
    get::command_output "CACRT_HASH" "$?" "exit"

    # Re-upload the control-plane certificates; the last output line is kept
    # as the certificate key.
    command::exec "${MGMT_NODE}" "
      kubeadm init phase upload-certs --upload-certs --config /etc/kubernetes/kubeadmcfg.yaml 2>> /dev/null | tail -1
    "
    get::command_output "INTI_CERTKEY" "$?" "exit"

    command::exec "${MGMT_NODE}" "
      kubeadm token create
    "
    get::command_output "INIT_TOKEN" "$?" "exit"

    # PAUSE_VERSION is not referenced again in this function; a failure here is
    # not fatal (no "exit" argument).
    command::exec "${MGMT_NODE}" "
      kubeadm config images list 2>/dev/null | awk -F: '/pause/ {print \$2}'
    "
    get::command_output "PAUSE_VERSION" "$?"

    for host in $MASTER_NODES; do
      # The management node is already part of the cluster.
      [[ "${MGMT_NODE}" == "$host" ]] && continue

      log::info "[kubeadm join]" "master $host join cluster."
      command::exec "${host}" "
        cat << EOF > /etc/kubernetes/kubeadmcfg.yaml
---
apiVersion: kubeadm.k8s.io/v1beta2
kind: JoinConfiguration
discovery:
  bootstrapToken:
    apiServerEndpoint: $KUBE_APISERVER:6443
    caCertHashes:
    - sha256:${CACRT_HASH:-}
    token: ${INIT_TOKEN}
  timeout: 5m0s
controlPlane:
  certificateKey: ${INTI_CERTKEY:-}
${kubelet_nodeRegistration}
EOF
        kubeadm join --config /etc/kubernetes/kubeadmcfg.yaml
      "
      check::exit_code "$?" "kubeadm join" "master $host join cluster"

      log::info "[kubeadm join]" "$host: set kube config."
      command::exec "${host}" "
        mkdir -p \$HOME/.kube
        sudo cp -f /etc/kubernetes/admin.conf \$HOME/.kube/config
      "
      check::exit_code "$?" "kubeadm join" "$host: set kube config" "exit"

      # Point the apiserver name at the local apiserver on the new master.
      command::exec "${host}" "
        sed -i 's#.*$KUBE_APISERVER#127.0.0.1 $KUBE_APISERVER#g' /etc/hosts
      "
      check::exit_code "$?" "kubeadm join" "$host: set $KUBE_APISERVER to 127.0.0.1 in /etc/hosts"
    done

    for host in $WORKER_NODES; do
      log::info "[kubeadm join]" "worker $host join cluster."
      command::exec "${host}" "
        mkdir -p /etc/kubernetes/manifests
        cat << EOF > /etc/kubernetes/kubeadmcfg.yaml
---
apiVersion: kubeadm.k8s.io/v1beta2
kind: JoinConfiguration
discovery:
  bootstrapToken:
    apiServerEndpoint: $KUBE_APISERVER:6443
    caCertHashes:
    - sha256:${CACRT_HASH:-}
    token: ${INIT_TOKEN}
  timeout: 5m0s
${kubelet_nodeRegistration}
EOF
        kubeadm join --config /etc/kubernetes/kubeadmcfg.yaml
      "
      check::exit_code "$?" "kubeadm join" "worker $host join cluster"

      # Label every non-master node that still shows '<none>' in the kubectl
      # node listing as a worker.
      log::info "[kubeadm join]" "set $host worker node role."
      command::exec "${MGMT_NODE}" "
        kubectl get node --selector='!node-role.kubernetes.io/master' | grep '<none>' | awk '{print \"kubectl label node \" \$1 \" node-role.kubernetes.io/worker= --overwrite\" }' | bash
      "
      check::exit_code "$?" "kubeadm join" "set $host worker node role"
    done
  }
  # Wait on the management node until a resource reports the Ready condition.
  #
  # Arguments:
  #   $1 - application name, used only in log messages
  #   $2 - namespace handed to kubectl wait
  #   $3 - resource to wait for
  #   $4 - label selector (optional, defaults to empty)
  # Returns: the exit status of the remote command.
  function kube::wait() {
    local app=$1
    local namespace=$2
    local resource=$3
    local selector=${4:-}
    local retry_def
    local rc

    sleep 3
    log::info "[waiting]" "waiting $app"

    # Ship the local definition of utils::retry so it exists on the remote side.
    retry_def="$(declare -f utils::retry)"
    command::exec "${MGMT_NODE}" "
      ${retry_def}
      utils::retry 6 kubectl wait --namespace ${namespace} \
      --for=condition=ready ${resource} \
      --selector=$selector \
      --timeout=60s
    "
    rc="$?"

    check::exit_code "$rc" "waiting" "$app ${resource} ready"
    return "$rc"
  }
  # Apply a manifest on the management node.
  #
  # Arguments:
  #   $1 - path of a manifest file on the management node; when no such file
  #        exists there, it is only used as a label in log messages
  #   $2 - inline manifest text (optional), piped to kubectl when $1 is not a
  #        file on the management node
  # Returns: the exit status of the remote command. check::exit_code is called
  #          with "exit".
  function kube::apply() {
    local file=$1
    local manifest="${2:-}"
    local retry_def
    local rc

    log::info "[apply]" "$file"

    # Ship the local definition of utils::retry so it exists on the remote side.
    retry_def="$(declare -f utils::retry)"
    command::exec "${MGMT_NODE}" "
      ${retry_def}
      if [ -f \"$file\" ]; then
        utils::retry 6 kubectl apply --wait=true --timeout=10s -f \"$file\"
      else
        utils::retry 6 \"cat <<EOF | kubectl apply --wait=true --timeout=10s -f -
\$(printf \"%s\" \"${manifest}\")
EOF
        \"
      fi
    "
    rc="$?"

    check::exit_code "$rc" "apply" "add $file" "exit"
    return "$rc"
  }
  # Print the node list and the pods of all namespaces from the management
  # node. The captured output is printed only when the remote command succeeds.
  function kube::status() {
    local report_cmd="
      echo
      kubectl get node -o wide
      echo
      kubectl get pods -A
    "

    sleep 5
    log::info "[cluster]" "cluster status"

    # On failure hand the remote status back to the caller without printing.
    command::exec "${MGMT_NODE}" "${report_cmd}" || return "$?"
    printf "%s" "${COMMAND_OUTPUT}"
  }
  # Add or remove the apiserver backends of MASTER_NODES in the haproxy
  # configuration of every non-master node.
  #
  # Arguments:
  #   $1 - "add" (default) appends one "server" line per master; any other
  #        value removes the lines of the listed masters
  # Globals read: MASTER_NODES, MGMT_NODE
  # Returns early without touching any node when MASTER_NODES is empty or
  # "127.0.0.1", or (remove mode) when a listed master is not a cluster member.
  function config::haproxy_backend() {
    local action=${1:-add}
    local action_cmd=""
    local master_nodes=""
    local worker_nodes=""
    local m host escaped

    if [[ "$MASTER_NODES" == "" || "$MASTER_NODES" == "127.0.0.1" ]]; then
      return
    fi

    command::exec "${MGMT_NODE}" "
      kubectl get node --selector='node-role.kubernetes.io/master' -o jsonpath='{\$.items[*].status.addresses[?(@.type==\"InternalIP\")].address}'
    "
    get::command_output "master_nodes" "$?" "exit"

    for m in $MASTER_NODES; do
      if [[ "${action}" == "add" ]]; then
        # The server name suffix is the text after the last dot of the address.
        action_cmd+="echo \" server apiserver${m##*.} ${m}:6443 check\" >> /etc/haproxy/haproxy.cfg"$'\n'
      else
        # Compare whole addresses: 10.0.0.1 must not match 10.0.0.11.
        [[ " ${master_nodes} " == *" ${m} "* ]] || return
        # Escape the dots and anchor on the surrounding whitespace and colon so
        # only the backend line of exactly this address is deleted.
        escaped=${m//./\\.}
        action_cmd+="sed -i -e \"/[[:space:]]${escaped}:/d\" /etc/haproxy/haproxy.cfg"$'\n'
      fi
    done

    command::exec "${MGMT_NODE}" "
      kubectl get node --selector='!node-role.kubernetes.io/master' -o jsonpath='{\$.items[*].status.addresses[?(@.type==\"InternalIP\")].address}'
    "
    get::command_output "worker_nodes" "$?"

    for host in ${worker_nodes:-}; do
      log::info "[config]" "worker ${host}: ${action} apiserver from haproxy"
      # Validate the edited configuration before reloading haproxy.
      command::exec "${host}" "
        ${action_cmd}
        haproxy -c -f /etc/haproxy/haproxy.cfg && systemctl reload haproxy
      "
      check::exit_code "$?" "config" "worker ${host}: ${action} apiserver(${MASTER_NODES}) from haproxy"
    done
  }
  # Patch the etcd-snapshot CronJob in kube-system so that completions and
  # parallelism equal the number of nodes carrying the master role label.
  # The count is computed on the management node.
  function config::etcd_snapshot() {
    local patch_cmd="
      count=\$(kubectl get node --selector='node-role.kubernetes.io/master' --no-headers | wc -l)
      kubectl -n kube-system patch cronjobs etcd-snapshot --patch \"
spec:
  jobTemplate:
    spec:
      completions: \${count:-1}
      parallelism: \${count:-1}
\"
    "

    command::exec "${MGMT_NODE}" "${patch_cmd}"
    check::exit_code "$?" "config" "etcd-snapshot completions options"
  }
  # Store the output of the last command::exec call in a named variable.
  #
  # Arguments:
  #   $1 - name of the variable that receives COMMAND_OUTPUT
  #   $2 - exit status of the command that produced COMMAND_OUTPUT
  #   $3 - pass "exit" to terminate the script when no value was obtained
  # Globals read: COMMAND_OUTPUT
  # Returns: $2 unchanged. The variable is only assigned when $2 is 0 and the
  #          output is non-empty; otherwise its previous value is kept.
  function get::command_output() {
    local app="$1"
    local status="$2"
    local is_exit="${3:-}"

    if [[ "$status" == "0" && "${COMMAND_OUTPUT:-}" != "" ]]; then
      log::info "[command]" "get $app value succeeded."
      # Assign literally. printf -v does not re-parse the text, so quotes,
      # "$", backticks and backslashes in the output are stored as-is instead
      # of being expanded or executed.
      printf -v "$app" '%s' "${COMMAND_OUTPUT}"
    else
      log::error "[command]" "get $app value failed."
      if [[ "$is_exit" == "exit" ]]; then
        # An empty result with status 0 is still a failure: never exit 0 here.
        [[ "$status" == "0" ]] && exit 1
        exit "$status"
      fi
    fi

    return "$status"
  }
  # Build "<node ip>:<node port>" for reaching an ingress service and store it
  # in INGRESS_CONN.
  #
  # Arguments:
  #   $1 - service port to look up (default 80)
  #   $2 - service name (default "ingress-${KUBE_INGRESS}-controller")
  # Globals read:    MGMT_NODE, KUBE_INGRESS
  # Globals written: INGRESS_CONN
  # When a lookup yields nothing, the placeholder "nodeIP" / "nodePort" is used.
  function get::ingress_conn() {
    local port="${1:-80}"
    local ingress_name="${2:-ingress-${KUBE_INGRESS}-controller}"
    # Local and reset on every call, so a failed lookup falls back to the
    # placeholder instead of reusing the value of an earlier call.
    local node_ip=""
    local node_port=""

    # The awk filter keeps the address of the last listed node whose second
    # output column is "True".
    command::exec "${MGMT_NODE}" "
      kubectl get node -o jsonpath='{range .items[*]}{ .status.addresses[?(@.type==\"InternalIP\")].address} {.status.conditions[?(@.status == \"True\")].status}{\"\\n\"}{end}' | awk '{if(\$2==\"True\")a=\$1}END{print a}'
    "
    get::command_output "node_ip" "$?"

    # nodePort of the named service (searched in all namespaces) for the port.
    command::exec "${MGMT_NODE}" "
      kubectl get svc --all-namespaces -o go-template=\"{{range .items}}{{if eq .metadata.name \\\"${ingress_name}\\\"}}{{range.spec.ports}}{{if eq .port ${port}}}{{.nodePort}}{{end}}{{end}}{{end}}{{end}}\"
    "
    get::command_output "node_port" "$?"

    INGRESS_CONN="${node_ip:-nodeIP}:${node_port:-nodePort}"
  }
  1773. ######################################################################################################
  1774. # 主调用逻辑
  1775. ######################################################################################################
  # Install the network plugin selected by KUBE_NETWORK (flannel, calico or
  # cilium). Any other value only produces a warning.
  #
  # Globals read: KUBE_NETWORK, OFFLINE_DIR, MGMT_NODE, KUBE_POD_SUBNET,
  #               KUBE_IMAGE_REPO, KUBE_FLANNEL_TYPE, FLANNEL_VERSION,
  #               CALICO_VERSION, CILIUM_VERSION, MASTER_NODES, WORKER_NODES,
  #               KUBE_INGRESS, INGRESS_CONN
  function add::network() {
    case "$KUBE_NETWORK" in
      flannel)
        log::info "[network]" "add flannel"

        local flannel_file="${OFFLINE_DIR}/manifests/kube-flannel.yml"
        utils::download_file "https://cdn.jsdelivr.net/gh/coreos/flannel@v${FLANNEL_VERSION}/Documentation/kube-flannel.yml" "${flannel_file}"

        # Rewrite pod subnet, image repository and backend type in the
        # manifest; the vxlan backend additionally gets DirectRouting.
        command::exec "${MGMT_NODE}" "
          sed -i -e 's#10.244.0.0/16#${KUBE_POD_SUBNET}#g' \
          -e 's#quay.io/coreos#${KUBE_IMAGE_REPO}#g' \
          -e 's#\"Type\": \"vxlan\"#\"Type\": \"${KUBE_FLANNEL_TYPE}\"#g' \"${flannel_file}\"
          if [[ \"${KUBE_FLANNEL_TYPE}\" == \"vxlan\" ]]; then
            sed -i 's#\"Type\": \"vxlan\"#\"Type\": \"vxlan\", \"DirectRouting\": true#g' \"${flannel_file}\"
          fi
        "
        check::exit_code "$?" "flannel" "change flannel pod subnet"

        kube::apply "${flannel_file}"
        kube::wait "flannel" "kube-system" "pods" "app=flannel"
        ;;

      calico)
        log::info "[network]" "add calico"

        local calico_file="${OFFLINE_DIR}/manifests/calico.yaml"
        local calicoctl_file="${OFFLINE_DIR}/manifests/calicoctl.yaml"
        # The archive URL uses the version without its last component.
        utils::download_file "https://projectcalico.docs.tigera.io/archive/v${CALICO_VERSION%.*}/manifests/calico.yaml" "${calico_file}"
        utils::download_file "https://projectcalico.docs.tigera.io/archive/v${CALICO_VERSION%.*}/manifests/calicoctl.yaml" "${calicoctl_file}"

        command::exec "${MGMT_NODE}" "
          sed -i \"s#:v.*#:v${CALICO_VERSION}#g\" \"${calico_file}\"
          sed -i 's#value: \"Always\"#value: \"CrossSubnet\"#g' \"${calico_file}\"
          sed -i \"s#:v.*#:v${CALICO_VERSION}#g\" \"${calicoctl_file}\"
        "
        check::exit_code "$?" "network" "change calico version to ${CALICO_VERSION}"

        kube::apply "${calico_file}"
        kube::apply "${calicoctl_file}"
        kube::wait "calico-kube-controllers" "kube-system" "pods" "k8s-app=calico-kube-controllers"
        kube::wait "calico-node" "kube-system" "pods" "k8s-app=calico-node"
        ;;

      cilium)
        log::info "[network]" "add cilium"

        local cilium_file="${OFFLINE_DIR}/manifests/cilium.yml"
        local cilium_hubble_file="${OFFLINE_DIR}/manifests/cilium_hubble.yml"
        utils::download_file "https://cdn.jsdelivr.net/gh/cilium/cilium@${CILIUM_VERSION}/install/kubernetes/quick-install.yaml" "${cilium_file}"
        utils::download_file "https://cdn.jsdelivr.net/gh/cilium/cilium@${CILIUM_VERSION}/install/kubernetes/quick-hubble-install.yaml" "${cilium_hubble_file}"

        # Use the configured node lists; ask the cluster only when both are
        # empty.
        local all_node=""
        if [[ "${MASTER_NODES}" != "" || "${WORKER_NODES}" != "" ]]; then
          all_node="${MASTER_NODES} ${WORKER_NODES}"
        else
          command::exec "${MGMT_NODE}" "
            kubectl get node -o jsonpath='{range.items[*]}{.status.addresses[?(@.type==\"InternalIP\")].address} {end}'
          "
          get::command_output "all_node" "$?"
        fi

        for host in $all_node; do
          command::exec "${host}" "mount bpffs -t bpf /sys/fs/bpf"
          check::exit_code "$?" "network" "${host}: mount bpf filesystem"
        done

        command::exec "${MGMT_NODE}" "
          sed -i \"s#10.0.0.0/8#${KUBE_POD_SUBNET}#g\" \"${cilium_file}\"
        "

        kube::apply "${cilium_file}"
        kube::wait "cilium-node" "kube-system" "pods" "k8s-app=cilium"
        kube::wait "cilium-operator" "kube-system" "pods" "name=cilium-operator"
        kube::apply "${cilium_hubble_file}"
        kube::wait "hubble-relay" "kube-system" "pods" "k8s-app=hubble-relay"

        log::info "[monitor]" "add hubble-ui ingress"
        # The access hint is logged only when the ingress was applied.
        if kube::apply "hubble-ui ingress" "
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: hubble-ui
  namespace: kube-system
  annotations:
    kubernetes.io/ingress.class: ${KUBE_INGRESS}
spec:
  rules:
  - host: hubble-ui.cluster.local
    http:
      paths:
      - path: /
        pathType: Prefix
        backend:
          service:
            name: hubble-ui
            port:
              number: 80
"
        then
          get::ingress_conn
          log::access "[ingress]" "curl -H 'Host:hubble-ui.cluster.local' http://${INGRESS_CONN}"
        fi
        ;;

      *)
        log::warning "[network]" "No $KUBE_NETWORK config."
        ;;
    esac
  }
  # Placeholder for installing the ingress component; it currently only logs.
  function add::ingress() {
    # TODO: add ingress
    local -r tag="[TODO]"
    local -r message="add ingress"

    log::info "${tag}" "${message}"
  }
  # Placeholder for installing the addon component; it currently only logs.
  function add::addon() {
    # TODO: add addon
    local -r tag="[TODO]"
    local -r message="add addon"

    log::info "${tag}" "${message}"
  }
  # Placeholder for installing the monitoring component; it currently only
  # logs. The message names this step (it used to say "add addon").
  function add::monitor() {
    # TODO: add monitor
    log::info "[TODO]" "add monitor"
  }
  # Placeholder for installing the logging component; it currently only logs.
  # The message names this step (it used to say "add addon").
  function add::log() {
    # TODO: add log
    log::info "[TODO]" "add log"
  }
  # Placeholder for installing the storage component; it currently only logs.
  # The message names this step (it used to say "add addon").
  function add::storage() {
    # TODO: add storage
    log::info "[TODO]" "add storage"
  }
  # Placeholder for installing the web UI component; it currently only logs.
  # The message names this step (it used to say "add addon").
  function add::ui() {
    # TODO: add ui
    log::info "[TODO]" "add ui"
  }
  # Operational add-ons: spread coredns replicas across nodes, install the
  # etcd-snapshot CronJob and trigger one snapshot job immediately.
  #
  # Globals read:    MASTER_NODES, MGMT_NODE, KUBE_IMAGE_REPO
  # Globals written: etcd_image (assigned through get::command_output)
  function add::ops() {
    local master_num
    local coredns_patch
    local snapshot_manifest

    # Fallback master count taken from MASTER_NODES; replaced below when the
    # cluster query succeeds.
    master_num=$(awk '{print NF}' <<< "${MASTER_NODES}")

    log::info "[ops]" "add anti-affinity strategy to coredns"
    coredns_patch='{"spec": {"template": {"spec": {"affinity":{"podAntiAffinity":{"preferredDuringSchedulingIgnoredDuringExecution":[{"weight":100,"podAffinityTerm":{"labelSelector":{"matchExpressions":[{"key":"k8s-app","operator":"In","values":["kube-dns"]}]},"topologyKey":"kubernetes.io/hostname"}}]}}}}}}'
    command::exec "${MGMT_NODE}" "
      kubectl -n kube-system patch deployment coredns --patch '${coredns_patch}' --record
    "
    check::exit_code "$?" "ops" "add anti-affinity strategy to coredns"

    log::info "[ops]" "add etcd snapshot cronjob"
    # Reuse the etcd image listed by kubeadm for the cluster configuration.
    command::exec "${MGMT_NODE}" "
      kubeadm config images list --config=/etc/kubernetes/kubeadmcfg.yaml 2>/dev/null | grep etcd:
    "
    get::command_output "etcd_image" "$?"

    command::exec "${MGMT_NODE}" "
      kubectl get node --selector='node-role.kubernetes.io/master' --no-headers | wc -l
    "
    get::command_output "master_num" "$?"

    # Run at least one snapshot pod.
    if [[ "${master_num:-0}" == "0" ]]; then
      master_num=1
    fi

    # The backslash runs below are deliberate: they keep date(1) and the find
    # terminator literal through every quoting layer until the job runs.
    snapshot_manifest="
---
apiVersion: batch/v1beta1
kind: CronJob
metadata:
  name: etcd-snapshot
  namespace: kube-system
spec:
  schedule: '0 */6 * * *'
  successfulJobsHistoryLimit: 3
  suspend: false
  concurrencyPolicy: Allow
  failedJobsHistoryLimit: 3
  jobTemplate:
    spec:
      backoffLimit: 6
      parallelism: ${master_num}
      completions: ${master_num}
      template:
        metadata:
          labels:
            app: etcd-snapshot
        spec:
          affinity:
            podAntiAffinity:
              requiredDuringSchedulingIgnoredDuringExecution:
              - labelSelector:
                  matchExpressions:
                  - key: app
                    operator: In
                    values:
                    - etcd-snapshot
                topologyKey: 'kubernetes.io/hostname'
          containers:
          - name: etcd-snapshot
            image: ${etcd_image:-${KUBE_IMAGE_REPO}/etcd:3.4.13-0}
            imagePullPolicy: IfNotPresent
            args:
            - -c
            - etcdctl --endpoints=https://127.0.0.1:2379 --cacert=/etc/kubernetes/pki/etcd/ca.crt
              --cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt --key=/etc/kubernetes/pki/etcd/healthcheck-client.key
              snapshot save /backup/etcd-snapshot-\\\\\\\$(date +%Y-%m-%d_%H:%M:%S_%Z).db
              && echo 'delete old backups' && { find /backup -type f -mtime +30 -exec rm -fv {} \\; || echo error; }
            command:
            - /usr/bin/bash
            env:
            - name: ETCDCTL_API
              value: '3'
            resources: {}
            terminationMessagePath: /dev/termination-log
            terminationMessagePolicy: File
            volumeMounts:
            - name: etcd-certs
              mountPath: /etc/kubernetes/pki/etcd
              readOnly: true
            - name: backup
              mountPath: /backup
            - name: etc
              mountPath: /etc
            - name: bin
              mountPath: /usr/bin
            - name: lib64
              mountPath: /lib64
          dnsPolicy: ClusterFirst
          hostNetwork: true
          nodeSelector:
            node-role.kubernetes.io/master: ''
          tolerations:
          - effect: NoSchedule
            operator: Exists
          restartPolicy: OnFailure
          schedulerName: default-scheduler
          securityContext: {}
          terminationGracePeriodSeconds: 30
          volumes:
          - name: etcd-certs
            hostPath:
              path: /etc/kubernetes/pki/etcd
              type: DirectoryOrCreate
          - name: backup
            hostPath:
              path: /var/lib/etcd/backups
              type: DirectoryOrCreate
          - name: etc
            hostPath:
              path: /etc
          - name: bin
            hostPath:
              path: /usr/bin
          - name: lib64
            hostPath:
              path: /lib64
"

    if kube::apply "etcd-snapshot" "${snapshot_manifest}"; then
      log::access "[ops]" "etcd backup directory: /var/lib/etcd/backups"
    fi

    # The job name suffix is the local epoch time at the moment of the call.
    command::exec "${MGMT_NODE}" "
      jobname=\"etcd-snapshot-$(date +%s)\"
      kubectl create job --from=cronjob/etcd-snapshot \${jobname} -n kube-system && \
      kubectl wait --for=condition=complete job/\${jobname} -n kube-system
    "
    check::exit_code "$?" "ops" "trigger etcd backup"
  }
  # Remove Kubernetes, the container runtimes, haproxy and the files and
  # network state created by this installer from one node.
  #
  # Arguments:
  #   $1 - address of the node to reset
  # Globals read: KUBE_APISERVER, OFFLINE_DIR (falls back to /tmp/abc)
  # The remote script starts with "set +ex" and ends with "echo done.".
  function reset::node() {
    local host=$1
    local offline_dir="${OFFLINE_DIR:-/tmp/abc}"

    log::info "[reset]" "node $host"
    command::exec "${host}" "
      set +ex

      # Pass an explicit CRI socket to kubeadm when crio or containerd is present.
      cri_socket=\"\"
      [ -S /var/run/crio/crio.sock ] && cri_socket=\"--cri-socket /var/run/crio/crio.sock\"
      [ -S /run/containerd/containerd.sock ] && cri_socket=\"--cri-socket /run/containerd/containerd.sock\"
      kubeadm reset -f \$cri_socket

      if [ -f \"\$(which kubelet)\" ]; then
        systemctl stop kubelet
        find /var/lib/kubelet | xargs -n 1 findmnt -n -o TARGET -T | sort | uniq | xargs -r umount -v
        yum remove -y kubeadm kubelet kubectl
      fi

      if [ -d /etc/kubernetes ]; then
        rm -rf /etc/kubernetes/* /var/lib/kubelet/* /var/lib/etcd/* \$HOME/.kube /etc/cni/net.d/* /var/lib/dockershim/* /var/lib/cni/* /var/run/kubernetes/*
      fi

      if [ -f \"\$(which docker)\" ]; then
        docker rm -f -v \$(docker ps | grep kube | awk '{print \$1}')
        systemctl stop docker
        rm -rf \$HOME/.docker /etc/docker/* /var/lib/docker/*
        yum remove -y docker
      fi

      if [ -f \"\$(which containerd)\" ]; then
        crictl rm \$(crictl ps -a -q)
        systemctl stop containerd
        rm -rf /etc/containerd/* /var/lib/containerd/*
        yum remove -y containerd.io
      fi

      if [ -f \"\$(which crio)\" ]; then
        crictl rm \$(crictl ps -a -q)
        systemctl stop crio
        rm -rf /etc/crictl.yaml /etc/crio/* /var/run/crio/*
        yum remove -y cri-o
      fi

      if [ -f \"\$(which runc)\" ]; then
        find /run/containers/ /var/lib/containers/ | xargs -n 1 findmnt -n -o TARGET -T | sort | uniq | xargs -r umount -v
        rm -rf /var/lib/containers/* /var/run/containers/*
        yum remove -y runc
      fi

      if [ -f \"\$(which haproxy)\" ]; then
        systemctl stop haproxy
        rm -rf /etc/haproxy/*
        yum remove -y haproxy
      fi

      # Drop host entries and the 'Kainstall managed' sections from system files.
      sed -i -e \"/$KUBE_APISERVER/d\" -e '/-worker-/d' -e '/-master-/d' /etc/hosts
      sed -i '/## Kainstall managed start/,/## Kainstall managed end/d' /etc/security/limits.conf /etc/systemd/system.conf /etc/bashrc /etc/rc.local /etc/audit/rules.d/audit.rules

      [ -d /var/lib/elasticsearch ] && rm -rf /var/lib/elasticsearch/*
      [ -d /var/lib/longhorn ] && rm -rf /var/lib/longhorn/*
      [ -d \"${offline_dir}\" ] && rm -rf \"${offline_dir}\"

      for repo in kubernetes.repo docker-ce.repo devel_kubic_libcontainers_stable.repo elrepo.repo
      do
        [ -f /etc/yum.repos.d/\${repo} ] && rm -f /etc/yum.repos.d/\${repo}
      done

      # Flush ipvs/iptables rules and delete the listed virtual interfaces.
      ipvsadm --clear
      iptables -F && iptables -t nat -F && iptables -t mangle -F && iptables -X
      for int in kube-ipvs0 cni0 docker0 dummy0 flannel.1 cilium_host cilium_net cilium_vxlan lxc_health nodelocaldns
      do
        [ -d /sys/class/net/\${int} ] && ip link delete \${int}
      done
      modprobe -r ipip

      echo done.
    "
    check::exit_code "$?" "reset" "$host: reset"
  }
  # Reset every node: the configured workers and masters plus every node
  # address currently reported by the cluster. Each address is reset once, in
  # first-seen order (workers, then masters, then the remaining cluster nodes).
  #
  # Globals read: MGMT_NODE, WORKER_NODES, MASTER_NODES
  function reset::cluster() {
    local all_node=""
    local unique_nodes=""
    local host candidate

    command::exec "${MGMT_NODE}" "
      kubectl get node -o jsonpath='{range.items[*]}{.status.addresses[?(@.type==\"InternalIP\")].address} {end}'
    "
    # Not fatal: without a reachable cluster only the configured nodes are reset.
    get::command_output "all_node" "$?"

    # Order-preserving de-duplication on whole words.
    for candidate in ${WORKER_NODES} ${MASTER_NODES} ${all_node}; do
      case " ${unique_nodes} " in
        *" ${candidate} "*) continue ;;
      esac
      unique_nodes+=" ${candidate}"
    done

    for host in ${unique_nodes}; do
      reset::node "$host"
    done
  }
  # Copy the unpacked offline bundle to every node of one role, install the
  # RPMs there and (unless only the kernel is upgraded) load the image
  # archives into docker. Afterwards the manifests and bins directories are
  # copied to the management node.
  #
  # Arguments:
  #   $1 - "master" or "worker"; any other value selects no hosts
  # Globals read: MASTER_NODES, WORKER_NODES, OFFLINE_DIR, TMP_DIR, MGMT_NODE,
  #               UPGRADE_KERNEL_TAG
  function offline::load() {
    local role="${1:-}"
    local hosts=""
    local host

    if [[ "${role}" == "master" ]]; then
      hosts="${MASTER_NODES}"
    elif [[ "${role}" == "worker" ]]; then
      hosts="${WORKER_NODES}"
    fi

    for host in ${hosts}; do
      log::info "[offline]" "${role} ${host}: load offline file"
      command::exec "${host}" "[[ ! -d \"${OFFLINE_DIR}\" ]] && { mkdir -pv \"${OFFLINE_DIR}\"; chmod 777 \"${OFFLINE_DIR}\"; } ||:"
      check::exit_code "$?" "offline" "$host: mkdir offline dir" "exit"

      if [[ "${UPGRADE_KERNEL_TAG:-}" == "1" ]]; then
        # Kernel upgrade only: ship just the kernel packages.
        command::scp "${host}" "${TMP_DIR}/packages/kernel/*" "${OFFLINE_DIR}"
        check::exit_code "$?" "offline" "scp kernel file to $host" "exit"
      else
        log::info "[offline]" "${role} ${host}: copy offline file"
        command::scp "${host}" "${TMP_DIR}/packages/kubeadm/*" "${OFFLINE_DIR}"
        check::exit_code "$?" "offline" "scp kube file to $host" "exit"
        command::scp "${host}" "${TMP_DIR}/packages/all/*" "${OFFLINE_DIR}"
        check::exit_code "$?" "offline" "scp all file to $host" "exit"

        if [[ "${role}" == "worker" ]]; then
          command::scp "${host}" "${TMP_DIR}/packages/worker/*" "${OFFLINE_DIR}"
          check::exit_code "$?" "offline" "scp worker file to $host" "exit"
        fi

        command::scp "${host}" "${TMP_DIR}/images/${role}.tgz" "${OFFLINE_DIR}"
        check::exit_code "$?" "offline" "scp ${role} images to $host" "exit"
        command::scp "${host}" "${TMP_DIR}/images/all.tgz" "${OFFLINE_DIR}"
        check::exit_code "$?" "offline" "scp all images to $host" "exit"
      fi

      log::info "[offline]" "${role} ${host}: install package"
      # The directory is quoted; the glob stays outside the quotes so it expands.
      command::exec "${host}" "yum localinstall -y --skip-broken \"${OFFLINE_DIR}\"/*.rpm"
      check::exit_code "$?" "offline" "${role} ${host}: install package" "exit"

      if [[ "${UPGRADE_KERNEL_TAG:-}" != "1" ]]; then
        # Firewall services are stopped on a best-effort basis before docker
        # is started and the role-specific and shared images are loaded.
        command::exec "${host}" "
          set -e
          for target in firewalld python-firewall firewalld-filesystem iptables; do
            systemctl stop \$target &>/dev/null || true
            systemctl disable \$target &>/dev/null || true
          done
          systemctl start docker && \
          cd \"${OFFLINE_DIR}\" && \
          gzip -d -c ${role}.tgz | docker load && gzip -d -c all.tgz | docker load
        "
        check::exit_code "$?" "offline" "$host: load images" "exit"
      fi

      command::exec "${host}" "rm -rf \"${OFFLINE_DIR:-/tmp/abc}\""
      check::exit_code "$?" "offline" "$host: clean offline file"
    done

    command::scp "${MGMT_NODE}" "${TMP_DIR}/manifests" "${OFFLINE_DIR}"
    check::exit_code "$?" "offline" "scp manifests file to ${MGMT_NODE}" "exit"

    command::scp "${MGMT_NODE}" "${TMP_DIR}/bins" "${OFFLINE_DIR}"
    check::exit_code "$?" "offline" "scp bins file to ${MGMT_NODE}" "exit"
  }
  # Unpack the offline bundle into TMP_DIR and distribute it to the master
  # and worker nodes. Exits with status 1 when OFFLINE_FILE does not exist.
  #
  # Globals read: OFFLINE_FILE, TMP_DIR
  function offline::cluster() {
    if [[ ! -f "${OFFLINE_FILE}" ]]; then
      log::error "[offline]" "not found ${OFFLINE_FILE}"
      exit 1
    fi

    log::info "[offline]" "Unzip offline package on local."
    tar zxf "${OFFLINE_FILE}" -C "${TMP_DIR}/"
    check::exit_code "$?" "offline" "Unzip offline package"

    # Masters first, then workers.
    offline::load "master"
    offline::load "worker"
  }
  # Create a new cluster. The first entry of MASTER_NODES becomes the
  # management node (MGMT_NODE); the remaining steps run in a fixed order.
  # Storage, monitoring and logging are installed only when their *_TAG
  # variable equals "1".
  #
  # Globals read:    MASTER_NODES, OFFLINE_TAG, STORAGE_TAG, MONITOR_TAG, LOG_TAG
  # Globals written: MGMT_NODE
  function init::cluster() {
    MGMT_NODE=$(awk '{print $1}' <<< "${MASTER_NODES}")

    # Load the offline bundle first when requested.
    if [[ "${OFFLINE_TAG:-}" == "1" ]]; then
      offline::cluster
    fi

    init::node          # 1. initialize the nodes
    install::package    # 2. install the packages
    kubeadm::init       # 3. initialize kubeadm
    kubeadm::join       # 4. join the cluster
    add::network        # 5. add the network plugin
    add::addon          # 6. install the addon
    add::ingress        # 7. add the ingress

    # 8. add storage
    if [[ "${STORAGE_TAG:-}" == "1" ]]; then
      add::storage
    fi

    # 9. add the web UI
    add::ui

    # 10. add monitoring
    if [[ "${MONITOR_TAG:-}" == "1" ]]; then
      add::monitor
    fi

    # 11. add logging
    if [[ "${LOG_TAG:-}" == "1" ]]; then
      add::log
    fi

    add::ops            # 12. operational tasks
    kube::status        # 13. show the cluster status
  }
  # Add nodes to an existing cluster.
  #
  # When KUBE_VERSION is empty or "latest", it is replaced by a version read
  # from the kubelet versions of the nodes carrying the master role label.
  #
  # Globals read:    OFFLINE_TAG, KUBE_VERSION, MGMT_NODE
  # Globals written: KUBE_VERSION (through get::command_output)
  function add::node() {
    # Load the offline bundle first when requested.
    if [[ "${OFFLINE_TAG:-}" == "1" ]]; then
      offline::cluster
    fi

    case "${KUBE_VERSION}" in
      "" | "latest")
        # awk prints the second field when splitting on 'v' or a space.
        command::exec "${MGMT_NODE}" "
          kubectl get node --selector='node-role.kubernetes.io/master' -o jsonpath='{range.items[*]}{.status.nodeInfo.kubeletVersion } {end}' | awk -F'v| ' '{print \$2}'
        "
        get::command_output "KUBE_VERSION" "$?" "exit"
        ;;
    esac

    init::add_node                    # 1. initialize the nodes
    install::package                  # 2. install the packages
    kubeadm::join                     # 3. join the cluster
    config::haproxy_backend "add"     # 4. add the apiserver to haproxy
    config::etcd_snapshot             # 5. update the etcd snapshot replicas
    kube::status                      # 6. show the cluster status
  }
  # Remove the nodes listed in MASTER_NODES and WORKER_NODES from the cluster.
  #
  # Steps: remove the masters from the haproxy backends, remove their etcd
  # members, drain and delete every listed node, reset it, and finally drop
  # the deleted addresses from /etc/hosts on the nodes that were cluster
  # members when the function started.
  #
  # Addresses are always matched as whole words, so deleting 10.0.0.1 never
  # touches 10.0.0.11.
  #
  # Globals read: MGMT_NODE, MASTER_NODES, WORKER_NODES
  function del::node() {
    local cluster_nodes=""
    local del_hosts_cmd=""
    local host node_name answer host_re

    config::haproxy_backend "remove"

    # One "<InternalIP> <node name>" pair per node, separated by a literal
    # "\n" sequence that is expanded later with echo -e.
    command::exec "${MGMT_NODE}" "
      kubectl get node -o jsonpath='{range.items[*]}{.status.addresses[?(@.type==\"InternalIP\")].address} {.metadata.name }\\n{end}'
    "
    get::command_output "cluster_nodes" "$?" exit

    for host in $MASTER_NODES; do
      # grep -wF matches the address literally and as a whole word.
      command::exec "${MGMT_NODE}" "
        etcd_pod=\$(kubectl -n kube-system get pods -l component=etcd --field-selector=status.phase=Running -o jsonpath='{\$.items[0].metadata.name}')
        etcd_node=\$(kubectl -n kube-system exec \$etcd_pod -- sh -c \"export ETCDCTL_API=3 ETCDCTL_CACERT=/etc/kubernetes/pki/etcd/ca.crt ETCDCTL_CERT=/etc/kubernetes/pki/etcd/server.crt ETCDCTL_KEY=/etc/kubernetes/pki/etcd/server.key ETCDCTL_ENDPOINTS=https://127.0.0.1:2379; etcdctl member list\"| grep -wF -- '$host' | awk -F, '{print \$1}')
        echo \"\$etcd_pod \$etcd_node\"
        kubectl -n kube-system exec \$etcd_pod -- sh -c \"export ETCDCTL_API=3 ETCDCTL_CACERT=/etc/kubernetes/pki/etcd/ca.crt ETCDCTL_CERT=/etc/kubernetes/pki/etcd/server.crt ETCDCTL_KEY=/etc/kubernetes/pki/etcd/server.key ETCDCTL_ENDPOINTS=https://127.0.0.1:2379; etcdctl member remove \$etcd_node; etcdctl member list\"
      "
      check::exit_code "$?" "del" "remove $host etcd member"
    done

    for host in $MASTER_NODES $WORKER_NODES; do
      log::info "[del]" "node $host"

      # Exact match on the address column (or on the node name itself).
      node_name=$(echo -ne "${cluster_nodes}" | awk -v h="${host}" '$1 == h || $2 == h {print $2}')

      if [[ "${node_name}" == "" ]]; then
        log::warning "[del]" "node $host not found."
        answer=""
        read -r -t 10 -n 1 -p "Do you need to reset the node (y/n)? " answer
        # Anything but an explicit "y" (including a timeout) stops here.
        if [[ "${answer}" != "y" ]]; then
          exit 0
        fi
        echo
      else
        log::info "[del]" "drain $host"
        command::exec "${MGMT_NODE}" "kubectl drain ${node_name} --force --ignore-daemonsets --delete-local-data"
        check::exit_code "$?" "del" "$host: drain"

        log::info "[del]" "delete node $host"
        command::exec "${MGMT_NODE}" "kubectl delete node ${node_name}"
        check::exit_code "$?" "del" "$host: delete"
        sleep 3
      fi

      reset::node "$host"

      # Dots are escaped and the address is wrapped in word boundaries.
      host_re=${host//./\\.}
      del_hosts_cmd+="sed -i '/\\b${host_re}\\b/d' /etc/hosts"$'\n'
    done

    for host in $(echo -ne "${cluster_nodes}" | awk '{print $1}'); do
      log::info "[del]" "$host: remove del node hostname resolution"
      command::exec "${host}" "
        ${del_hosts_cmd}
      "
      check::exit_code "$?" "del" "remove del node hostname resolution"
    done

    [ "$MASTER_NODES" != "" ] && config::etcd_snapshot
    kube::status
  }
  # Upgrade the cluster.
  #
  # For every node (MASTER_NODES + WORKER_NODES, or - when both are empty - the
  # node names returned by `kubectl get node` on MGMT_NODE) that passes the
  # version checks: drain it, run script::upgrage_kube on it, wait for it to
  # become Ready and uncordon it. Finishes by calling kube::status.
  #
  # Globals read: KUBE_VERSION, MASTER_NODES, WORKER_NODES, MGMT_NODE,
  #               SKIP_UPGRADE_PLAN
  # Arguments:    none
  function upgrade::cluster() {
    log::info "[upgrade]" "upgrade to $KUBE_VERSION"
    log::info "[upgrade]" "backup cluster"
    add::ops

    # Initial value; NOTE(review): get::command_output presumably overwrites the
    # named variable with the output of the preceding command::exec - confirm
    # against its definition.
    local stable_version="2"
    command::exec "127.0.0.1" "wget https://storage.googleapis.com/kubernetes-release/release/stable.txt -q -O -"
    get::command_output "stable_version" "$?" && stable_version="${stable_version#v}"

    local node_hosts="$MASTER_NODES $WORKER_NODES"
    if [[ "$node_hosts" == " " ]]; then
      # No nodes were given on the command line: ask the cluster for them.
      command::exec "${MGMT_NODE}" "
      kubectl get node -o jsonpath='{range.items[*]}{.metadata.name } {end}'
      "
      get::command_output "node_hosts" "$?" exit
    fi

    # Lower-cased so that "False"/"FALSE" are treated like "false".
    local skip_plan=${SKIP_UPGRADE_PLAN,,}

    # Keep the loop variables function-local so they neither leak into nor
    # clobber same-named globals.
    local host local_version
    # Word-splitting of node_hosts is intentional: it is a space separated list.
    for host in ${node_hosts}; do
      log::info "[upgrade]" "node: $host"

      local_version=""
      command::exec "${host}" "kubectl version --client --short | awk '{print \$3}'"
      get::command_output "local_version" "$?" && local_version="${local_version#v}"

      if [[ "${KUBE_VERSION}" != "latest" ]]; then
        # An explicit target version: skip nodes that already run it, run a
        # newer one, or when the target is newer than the stable release.
        if [[ "${KUBE_VERSION}" == "${local_version}" ]]; then
          log::warning "[check]" "The specified version(${KUBE_VERSION}) is consistent with the local version(${local_version})!"
          continue
        fi
        if [[ $(utils::version_to_number "$KUBE_VERSION") -lt $(utils::version_to_number "${local_version}") ]]; then
          log::warning "[check]" "The specified version($KUBE_VERSION) is less than the local version(${local_version})!"
          continue
        fi
        if [[ $(utils::version_to_number "$KUBE_VERSION") -gt $(utils::version_to_number "${stable_version}") ]]; then
          log::warning "[check]" "The specified version($KUBE_VERSION) is more than the stable version(${stable_version})!"
          continue
        fi
      else
        # "latest": nothing to do when the node is already at (or past) stable.
        if [[ $(utils::version_to_number "${local_version}") -ge $(utils::version_to_number "${stable_version}") ]]; then
          log::warning "[check]" "The local version($local_version) is greater or equal to the stable version(${stable_version})!"
          continue
        fi
      fi

      # NOTE(review): kubectl documents --delete-local-data as deprecated in
      # favour of --delete-emptydir-data; kept as-is for older kubectl versions.
      command::exec "${MGMT_NODE}" "kubectl drain ${host} --ignore-daemonsets --delete-local-data"
      check::exit_code "$?" "upgrade" "drain ${host} node" "exit"
      sleep 5

      if [[ "${skip_plan}" == "false" ]]; then
        # First upgraded node: run the 'init' variant, then flip skip_plan so
        # that every following node takes the 'node' variant.
        command::exec "${host}" "$(declare -f script::upgrage_kube); script::upgrage_kube 'init' '$KUBE_VERSION'"
        check::exit_code "$?" "upgrade" "plan and upgrade cluster on ${host}" "exit"
        command::exec "${host}" "$(declare -f utils::retry); utils::retry 10 kubectl get node"
        check::exit_code "$?" "upgrade" "${host}: upgrade" "exit"
        skip_plan=true
      else
        command::exec "${host}" "$(declare -f script::upgrage_kube); script::upgrage_kube 'node' '$KUBE_VERSION'"
        check::exit_code "$?" "upgrade" "upgrade ${host} node" "exit"
      fi

      command::exec "${MGMT_NODE}" "kubectl wait --for=condition=Ready node/${host} --timeout=120s"
      check::exit_code "$?" "upgrade" "${host} ready"
      sleep 5

      command::exec "${MGMT_NODE}" "$(declare -f utils::retry); utils::retry 6 kubectl uncordon ${host}"
      check::exit_code "$?" "upgrade" "uncordon ${host} node"
      sleep 5
    done

    kube::status
  }
  # Update the script file.
  #
  # Downloads kainstall-centos.sh to /tmp and moves it over the running script
  # ($0), then marks it executable. The commands are run through command::exec
  # against 127.0.0.1 and the result is reported via check::exit_code.
  #
  # Arguments: none
  function update::self {
    log::info "[update]" "download kainstall script to $0"
    # Each step aborts on failure. Without `|| exit 1` on mv, a failed move
    # would be hidden by the following chmod succeeding on the old script.
    command::exec "127.0.0.1" "
      wget --timeout=10 --waitretry=3 --tries=5 --retry-connrefused https://cdn.jsdelivr.net/gh/lework/kainstall@master/kainstall-centos.sh -O /tmp/kainstall-centos.sh || exit 1
      /bin/mv -fv /tmp/kainstall-centos.sh \"$0\" || exit 1
      chmod +x \"$0\"
    "
    check::exit_code "$?" "update" "kainstall script"
  }
  # Data processing and constraints.
  #
  # - Turns the comma separated MASTER_NODES / WORKER_NODES into space
  #   separated lists.
  # - Exits with status 1 when KUBE_CRI is not docker, containerd or cri-o, or
  #   when a non-docker runtime is combined with offline mode (OFFLINE_TAG=1).
  # - Replaces the dockershim default endpoint with the runtime's own socket
  #   for containerd and cri-o.
  # - Sets the global kubelet_nodeRegistration to a YAML fragment.
  #
  # Globals read:    KUBE_CRI, KUBE_IMAGE_REPO, OFFLINE_TAG, PAUSE_VERSION
  # Globals written: MASTER_NODES, WORKER_NODES, KUBE_CRI_ENDPOINT,
  #                  kubelet_nodeRegistration
  function transform::data {
    MASTER_NODES="${MASTER_NODES//,/ }"
    WORKER_NODES="${WORKER_NODES//,/ }"

    if ! utils::is_element_in_array "$KUBE_CRI" docker containerd cri-o; then
      log::error "[limit]" "$KUBE_CRI is not supported, only [docker,containerd,cri-o]"
      exit 1
    fi

    if [[ "$KUBE_CRI" != "docker" && "${OFFLINE_TAG:-}" == "1" ]]; then
      log::error "[limit]" "$KUBE_CRI is not supported offline, only docker"
      exit 1
    fi

    # Only override the endpoint when it still holds the dockershim default,
    # i.e. the user did not choose one explicitly.
    if [[ "${KUBE_CRI_ENDPOINT}" == "/var/run/dockershim.sock" ]]; then
      case "$KUBE_CRI" in
        containerd) KUBE_CRI_ENDPOINT="unix:///run/containerd/containerd.sock" ;;
        cri-o) KUBE_CRI_ENDPOINT="unix:///var/run/crio/crio.sock" ;;
      esac
    fi

    # criSocket and kubeletExtraArgs are children of nodeRegistration, and the
    # two kubelet flags are children of kubeletExtraArgs, so the nesting must
    # be expressed through indentation.
    # NOTE(review): assumes the fragment is spliced into the kubeadm config at
    # column 0 - confirm against the consumer of kubelet_nodeRegistration.
    kubelet_nodeRegistration="nodeRegistration:
  criSocket: ${KUBE_CRI_ENDPOINT:-/var/run/dockershim.sock}
  kubeletExtraArgs:
    runtime-cgroups: /system.slice/${KUBE_CRI//-/}.service
    pod-infra-container-image: ${KUBE_IMAGE_REPO}/pause:${PAUSE_VERSION:-3.6}
"
  }
  # Usage help.
  #
  # Writes the command/flag overview and the example invocations to stdout,
  # then terminates the script with exit status 1.
  help::usage() {
    local script_name
    script_name="$(basename "$0")"

    cat << EOF
Install kubernetes cluster using kubeadm.
Usage:
${script_name} [command]
Available Commands:
init Init Kubernetes cluster.
reset Reset Kubernetes cluster.
add Add nodes to the cluster.
del Remove node from the cluster.
renew-cert Renew all available certificates.
upgrade Upgrading kubeadm clusters.
update Update script file.
Flag:
-m,--master master node, default: ''
-w,--worker work node, default: ''
-u,--user ssh user, default: ${SSH_USER}
-p,--password ssh password
--private-key ssh private key
-P,--port ssh port, default: ${SSH_PORT}
-v,--version kube version, default: ${KUBE_VERSION}
-n,--network cluster network, choose: [flannel,calico,cilium], default: ${KUBE_NETWORK}
-i,--ingress ingress controller, choose: [nginx,traefik], default: ${KUBE_INGRESS}
-ui,--ui cluster web ui, choose: [dashboard,kubesphere], default: ${KUBE_UI}
-a,--addon cluster add-ons, choose: [metrics-server,nodelocaldns], default: ${KUBE_ADDON}
-M,--monitor cluster monitor, choose: [prometheus]
-l,--log cluster log, choose: [elasticsearch]
-s,--storage cluster storage, choose: [rook,longhorn]
--cri cri tools, choose: [docker,containerd,cri-o], default: ${KUBE_CRI}
--cri-version cri version, default: ${KUBE_CRI_VERSION}
--cri-endpoint cri endpoint, default: ${KUBE_CRI_ENDPOINT}
-U,--upgrade-kernel upgrade kernel
-of,--offline-file specify the offline package file to load
--10years the certificate period is 10 years.
--sudo sudo mode
--sudo-user sudo user
--sudo-password sudo user password
Example:
[init cluster]
$0 init \\
--master 192.168.77.130,192.168.77.131,192.168.77.132 \\
--worker 192.168.77.133,192.168.77.134,192.168.77.135 \\
--user root \\
--password 123456 \\
--version 1.20.4
[reset cluster]
$0 reset \\
--user root \\
--password 123456
[add node]
$0 add \\
--master 192.168.77.140,192.168.77.141 \\
--worker 192.168.77.143,192.168.77.144 \\
--user root \\
--password 123456 \\
--version 1.20.4
[del node]
$0 del \\
--master 192.168.77.140,192.168.77.141 \\
--worker 192.168.77.143,192.168.77.144 \\
--user root \\
--password 123456
[other]
$0 renew-cert --user root --password 123456
$0 upgrade --version 1.20.4 --user root --password 123456
$0 update
$0 add --ingress traefik
$0 add --monitor prometheus
$0 add --log elasticsearch
$0 add --storage rook
$0 add --ui dashboard
$0 add --addon nodelocaldns
EOF

    exit 1
  }
  ######################################################################################################
  # main
  ######################################################################################################

  # Print "$1" with every occurrence of the SSH and sudo passwords replaced by
  # "zzzzzz". Both passwords are masked, and each is matched literally (the
  # pattern is quoted) so characters such as '*' or '[' in a password are not
  # interpreted as glob syntax.
  # Globals read: SSH_PASSWORD, SUDO_PASSWORD
  # Arguments:    $1 - text to sanitise
  # Outputs:      sanitised text on stdout (no trailing newline)
  function main::mask_secrets() {
    local text="${1:-}" secret
    for secret in "${SSH_PASSWORD:-}" "${SUDO_PASSWORD:-}"; do
      if [[ -n "${secret}" ]]; then
        text="${text//"${secret}"/zzzzzz}"
      fi
    done
    printf '%s' "${text}"
  }

  # Parse the command line into the global *_TAG, KUBE_*, SSH_*, SUDO_* and
  # OFFLINE_* variables. Parsing stops at the first empty argument; an unknown
  # argument prints the usage text via help::usage.
  # Arguments: the script's positional parameters ("$@")
  function main::parse_args() {
    while [[ "${1:-}" != "" ]]; do
      case "$1" in
        init) INIT_TAG=1 ;;
        reset) RESET_TAG=1 ;;
        add) ADD_TAG=1 ;;
        del) DEL_TAG=1 ;;
        renew-cert) RENEW_CERT_TAG=1 ;;
        upgrade) UPGRADE_TAG=1 ;;
        update) UPDATE_TAG=1 ;;
        -m | --master)
          shift
          MASTER_NODES=${1:-$MASTER_NODES}
          ;;
        -w | --worker)
          shift
          WORKER_NODES=${1:-$WORKER_NODES}
          ;;
        -u | --user)
          shift
          SSH_USER=${1:-$SSH_USER}
          ;;
        -p | --password)
          shift
          SSH_PASSWORD=${1:-$SSH_PASSWORD}
          ;;
        --private-key)
          shift
          # Falls back to the current SSH_PRIVATE_KEY (or empty) when no value
          # follows the flag.
          SSH_PRIVATE_KEY=${1:-${SSH_PRIVATE_KEY:-}}
          ;;
        -P | --port)
          shift
          SSH_PORT=${1:-$SSH_PORT}
          ;;
        -v | --version)
          shift
          KUBE_VERSION=${1:-$KUBE_VERSION}
          ;;
        -n | --network)
          shift
          NETWORK_TAG=1
          KUBE_NETWORK=${1:-$KUBE_NETWORK}
          ;;
        -i | --ingress)
          shift
          INGRESS_TAG=1
          KUBE_INGRESS=${1:-$KUBE_INGRESS}
          ;;
        -M | --monitor)
          shift
          MONITOR_TAG=1
          KUBE_MONITOR=${1:-$KUBE_MONITOR}
          ;;
        -l | --log)
          shift
          LOG_TAG=1
          KUBE_LOG=${1:-$KUBE_LOG}
          ;;
        -s | --storage)
          shift
          STORAGE_TAG=1
          KUBE_STORAGE=${1:-$KUBE_STORAGE}
          ;;
        -ui | --ui)
          shift
          UI_TAG=1
          KUBE_UI=${1:-$KUBE_UI}
          ;;
        -a | --addon)
          shift
          ADDON_TAG=1
          KUBE_ADDON=${1:-$KUBE_ADDON}
          ;;
        --cri)
          shift
          KUBE_CRI=${1:-$KUBE_CRI}
          ;;
        --cri-version)
          shift
          KUBE_CRI_VERSION=${1:-$KUBE_CRI_VERSION}
          ;;
        --cri-endpoint)
          shift
          KUBE_CRI_ENDPOINT=${1:-$KUBE_CRI_ENDPOINT}
          ;;
        -U | --upgrade-kernel) UPGRADE_KERNEL_TAG=1 ;;
        -of | --offline-file)
          shift
          OFFLINE_TAG=1
          OFFLINE_FILE=${1:-$OFFLINE_FILE}
          ;;
        --10years) CERT_YEAR_TAG=1 ;;
        --sudo) SUDO_TAG=1 ;;
        --sudo-user)
          shift
          SUDO_USER=${1:-$SUDO_USER}
          ;;
        --sudo-password)
          shift
          SUDO_PASSWORD=${1:-}
          ;;
        *)
          help::usage
          exit 1
          ;;
      esac
      shift
    done
  }

  # No arguments at all: show the usage text.
  [[ "$#" -eq 0 ]] && help::usage
  main::parse_args "$@"

  # Start: log the invocation with the passwords masked.
  log::info "[start]" "bash $0 $(main::mask_secrets "${SCRIPT_PARAMETER}")"

  # Data processing
  transform::data

  # Preflight checks
  check::preflight

  # Action dispatch
  if [[ "${INIT_TAG:-}" == "1" ]]; then
    # Without an explicit master list the cluster is initialised on this host.
    if [[ "$MASTER_NODES" == "" ]]; then
      MASTER_NODES="127.0.0.1"
    fi
    init::cluster
  elif [[ "${ADD_TAG:-}" == "1" ]]; then
    # `add` records whether at least one add action ran; otherwise show usage.
    if [[ "${NETWORK_TAG:-}" == "1" ]]; then add::network; add=1; fi
    if [[ "${INGRESS_TAG:-}" == "1" ]]; then add::ingress; add=1; fi
    if [[ "${STORAGE_TAG:-}" == "1" ]]; then add::storage; add=1; fi
    if [[ "${MONITOR_TAG:-}" == "1" ]]; then add::monitor; add=1; fi
    if [[ "${LOG_TAG:-}" == "1" ]]; then add::log; add=1; fi
    if [[ "${UI_TAG:-}" == "1" ]]; then add::ui; add=1; fi
    if [[ "${ADDON_TAG:-}" == "1" ]]; then add::addon; add=1; fi
    if [[ "$MASTER_NODES" != "" || "$WORKER_NODES" != "" ]]; then add::node; add=1; fi
    if [[ "${add:-}" != "1" ]]; then help::usage; fi
  elif [[ "${DEL_TAG:-}" == "1" ]]; then
    if [[ "$MASTER_NODES" != "" || "$WORKER_NODES" != "" ]]; then
      del::node
    else
      help::usage
    fi
  elif [[ "${RESET_TAG:-}" == "1" ]]; then
    reset::cluster
  elif [[ "${RENEW_CERT_TAG:-}" == "1" ]]; then
    cert::renew
  elif [[ "${UPGRADE_TAG:-}" == "1" ]]; then
    upgrade::cluster
  elif [[ "${UPDATE_TAG:-}" == "1" ]]; then
    update::self
  else
    help::usage
  fi