Linux kernel Kallsyms 构建与解析

Kallsyms 是 Linux 内核镜像自带的符号表，包含了大多数内核函数、变量、标签的地址，因此是分析 Linux 内核镜像的重要工具。本文将首先介绍 Kallsyms 的结构在内核构建过程中的生成过程，然后以 vmlinux-to-elf 为例，讨论如何解析 vmlinux 以找到 kallsyms 结构的位置。

构建

生成 kallsyms 在内核中的布局，是靠 scripts/kallsyms.c 这个程序，它会生成一个汇编源文件（out/.tmp_vmlinux.kallsyms2.S），其中包含了 kallsyms 的必要结构。

本文参考 android common kernel android14-6.1 的 scripts/kallsyms.c

由于内核包含数量巨大的符号（约 10w+ 符号，500w+ 字符），直接明文存储太浪费内存，因此需要进行压缩操作。

kallsyms.c 需要建立一个符号表，长度为 256 ，因为 0-255 都编码为组成符号的某个子串。

为了最大程度节省空间，这些子串必须是常见的；同时为了能编码所有的符号，这 256 个字符所编码的子串集合必须包含所有内核符号的字符集合（也就是说，为了编码更多的子串，这 256 个字符对应子串的字符集合要等于（而非真包含）内核符号的字符集合）

入口

看到主入口，主要分为这几个步骤：

   // 1：读取必要符号（如果无 all kallsyms ，要过滤一些符号）
read_map(argv[optind]);
shrink_table();
if (absolute_percpu)
	make_percpus_absolute();
   // 2. 排序（优先按照地址升序）
sort_symbols();
if (base_relative)
	record_relative_base();
   // 3. 构造符号单词表
optimize_token_table();
   // 4. 写汇编
write_src();

需要注意读取符号的时候同时算上了它的 type （一个字符），从而可以一起参与压缩。type 在符号的第 0 个字符，len 是包含 type 和符号名字，不含结尾 \0 的长度。

构建单词表和优化符号

我们先关注构造符号单词表的过程：

static void optimize_token_table(void)
{
	build_initial_tok_table();

	insert_real_symbols_in_table();

	optimize_result();
}

/* do the initial token count */
static void build_initial_tok_table(void)
{
	unsigned int i;

	for (i = 0; i < table_cnt; i++)
		learn_symbol(table[i]->sym, table[i]->len);
}

learn_symbol 用于填充 token_profit ，表示符号中每一个二字词的出现频数。由于符号字符是 0-255 ，因此这个表大小是 256*256 = 0x10000

static int token_profit[0x10000];

/* count all the possible tokens in a symbol */
static void learn_symbol(const unsigned char *symbol, int len)
{
	int i;

	for (i = 0; i < len - 1; i++)
		token_profit[ symbol[i] + (symbol[i + 1] << 8) ]++;
}

接下来 insert_real_symbols_in_table 初始填充 best_table ，这个 best_table 每个元素就对应字符编码成的 token ，这里只用了两个字符就能存储整个 token 表，具体原因稍后会解释。best_table_len 就是每个字符所编码的 token 长度。

初始填充的方法很简单，就是将符号表的每个字符都填入，因此填充完成后符号表的字符集合的每一个元素都在 best_table 中，字符编码成它本身，长度为 1 ，此后不会改变，因为要保证这个编码表能完整表示符号表。

/* the table that holds the result of the compression */
static unsigned char best_table[256][2];
static unsigned char best_table_len[256];

/* start by placing the symbols that are actually used on the table */
static void insert_real_symbols_in_table(void)
{
	unsigned int i, j, c;

	for (i = 0; i < table_cnt; i++) {
		for (j = 0; j < table[i]->len; j++) {
			c = table[i]->sym[j];
			best_table[c][0]=c;
			best_table_len[c]=1;
		}
	}
}

最后就是 optimize_result ，它从后往前寻找 best_table_len 中的空位（也就是初始化没填写的字符，或者说不在字符集里的字符），将其编码为频数最高的二字词，之后替换符号字符中所有的这个二字词为这个新的编码，重新统计频数，如此往复直到二字词频数都为 0 或者 best_table_len 已经填满。

/* this is the core of the algorithm: calculate the "best" table */
static void optimize_result(void)
{
	int i, best;

	/* using the '\0' symbol last allows compress_symbols to use standard
	 * fast string functions */
	for (i = 255; i >= 0; i--) {

		/* if this table slot is empty (it is not used by an actual
		 * original char code */
		if (!best_table_len[i]) {

			/* find the token with the best profit value */
			best = find_best_token();
			if (token_profit[best] == 0)
				break;

			/* place it in the "best" table */
			best_table_len[i] = 2;
			best_table[i][0] = best & 0xFF;
			best_table[i][1] = (best >> 8) & 0xFF;

			/* replace this token in all the valid symbols */
			compress_symbols(best_table[i], i);
		}
	}
}

/* search the token with the maximum profit */
static int find_best_token(void)
{
	int i, best, bestprofit;

	bestprofit=-10000;
	best = 0;

	for (i = 0; i < 0x10000; i++) {
		if (token_profit[i] > bestprofit) {
			best = i;
			bestprofit = token_profit[i];
		}
	}
	return best;
}

compress_symbols 就是对符号表的每个符号，如果包含这个频数最高的二字词，则忘掉这个符号原来的频数，将符号内容中所有的这个二字词替换成新编码，随后重新计算频数。这样一来，填充下一个符号就可以用新的频数，同时也允许新的二字词继续计算频数（这就是说后来的字符所编码的二字词可能会包含已有的二字词编码）

/* replace a given token in all the valid symbols. Use the sampled symbols
 * to update the counts */
static void compress_symbols(const unsigned char *str, int idx)
{
	unsigned int i, len, size;
	unsigned char *p1, *p2;

	for (i = 0; i < table_cnt; i++) {

		len = table[i]->len;
		p1 = table[i]->sym;

		/* find the token on the symbol */
		p2 = find_token(p1, len, str);
		if (!p2) continue;

		/* decrease the counts for this symbol's tokens */
		forget_symbol(table[i]->sym, len);

		size = len;

		do {
			*p2 = idx;
			p2++;
			size -= (p2 - p1);
			memmove(p2, p2 + 1, size);
			p1 = p2;
			len--;

			if (size < 2) break;

			/* find the token on the symbol */
			p2 = find_token(p1, size, str);

		} while (p2);

		table[i]->len = len;

		/* increase the counts for this symbol's new tokens */
		learn_symbol(table[i]->sym, len);
	}
}

/* decrease the count for all the possible tokens in a symbol */
static void forget_symbol(const unsigned char *symbol, int len)
{
	int i;

	for (i = 0; i < len - 1; i++)
		token_profit[ symbol[i] + (symbol[i + 1] << 8) ]--;
}

整个过程完成之后，单词表构建完成，且所有的符号名字都转为用新的单词表进行编码。

布局

接下来写入汇编进行布局

static void write_src(void)
{
	unsigned int i, k, off;
	unsigned int best_idx[256];
	unsigned int *markers;
	char buf[KSYM_NAME_LEN];

	printf("#include <asm/bitsperlong.h>\n");
	printf("#if BITS_PER_LONG == 64\n");
	printf("#define PTR .quad\n");
	printf("#define ALGN .balign 8\n");
	printf("#else\n");
	printf("#define PTR .long\n");
	printf("#define ALGN .balign 4\n");
	printf("#endif\n");

	printf("\t.section .rodata, \"a\"\n");

	if (!base_relative)
		output_label("kallsyms_addresses");
	else
		output_label("kallsyms_offsets");

	for (i = 0; i < table_cnt; i++) {
		if (base_relative) {
			/*
			 * Use the offset relative to the lowest value
			 * encountered of all relative symbols, and emit
			 * non-relocatable fixed offsets that will be fixed
			 * up at runtime.
			 */

			long long offset;
			int overflow;

			if (!absolute_percpu) {
				offset = table[i]->addr - relative_base;
				overflow = (offset < 0 || offset > UINT_MAX);
			} else if (symbol_absolute(table[i])) {
				offset = table[i]->addr;
				overflow = (offset < 0 || offset > INT_MAX);
			} else {
				offset = relative_base - table[i]->addr - 1;
				overflow = (offset < INT_MIN || offset >= 0);
			}
			if (overflow) {
				fprintf(stderr, "kallsyms failure: "
					"%s symbol value %#llx out of range in relative mode\n",
					symbol_absolute(table[i]) ? "absolute" : "relative",
					table[i]->addr);
				exit(EXIT_FAILURE);
			}
			printf("\t.long\t%#x\n", (int)offset);
		} else if (!symbol_absolute(table[i])) {
			output_address(table[i]->addr);
		} else {
			printf("\tPTR\t%#llx\n", table[i]->addr);
		}
	}
	printf("\n");

	if (base_relative) {
		output_label("kallsyms_relative_base");
		output_address(relative_base);
		printf("\n");
	}

	output_label("kallsyms_num_syms");
	printf("\t.long\t%u\n", table_cnt);
	printf("\n");

	/* table of offset markers, that give the offset in the compressed stream
	 * every 256 symbols */
	markers = malloc(sizeof(unsigned int) * ((table_cnt + 255) / 256));
	if (!markers) {
		fprintf(stderr, "kallsyms failure: "
			"unable to allocate required memory\n");
		exit(EXIT_FAILURE);
	}

	output_label("kallsyms_names");
	off = 0;
	for (i = 0; i < table_cnt; i++) {
		if ((i & 0xFF) == 0)
			markers[i >> 8] = off;
		table[i]->seq = i;

		/* There cannot be any symbol of length zero. */
		if (table[i]->len == 0) {
			fprintf(stderr, "kallsyms failure: "
				"unexpected zero symbol length\n");
			exit(EXIT_FAILURE);
		}

		/* Only lengths that fit in up-to-two-byte ULEB128 are supported. */
		if (table[i]->len > 0x3FFF) {
			fprintf(stderr, "kallsyms failure: "
				"unexpected huge symbol length\n");
			exit(EXIT_FAILURE);
		}

		/* Encode length with ULEB128. */
		if (table[i]->len <= 0x7F) {
			/* Most symbols use a single byte for the length. */
			printf("\t.byte 0x%02x", table[i]->len);
			off += table[i]->len + 1;
		} else {
			/* "Big" symbols use two bytes. */
			printf("\t.byte 0x%02x, 0x%02x",
				(table[i]->len & 0x7F) | 0x80,
				(table[i]->len >> 7) & 0x7F);
			off += table[i]->len + 2;
		}
		for (k = 0; k < table[i]->len; k++)
			printf(", 0x%02x", table[i]->sym[k]);
		printf("\n");
	}
	printf("\n");

	output_label("kallsyms_markers");
	for (i = 0; i < ((table_cnt + 255) >> 8); i++)
		printf("\t.long\t%u\n", markers[i]);
	printf("\n");

	free(markers);

	sort_symbols_by_name();
	output_label("kallsyms_seqs_of_names");
	for (i = 0; i < table_cnt; i++)
		printf("\t.byte 0x%02x, 0x%02x, 0x%02x\n",
			(unsigned char)(table[i]->seq >> 16),
			(unsigned char)(table[i]->seq >> 8),
			(unsigned char)(table[i]->seq >> 0));
	printf("\n");

	output_label("kallsyms_token_table");
	off = 0;
	for (i = 0; i < 256; i++) {
		best_idx[i] = off;
		expand_symbol(best_table[i], best_table_len[i], buf);
		printf("\t.asciz\t\"%s\"\n", buf);
		off += strlen(buf) + 1;
	}
	printf("\n");

	output_label("kallsyms_token_index");
	for (i = 0; i < 256; i++)
		printf("\t.short\t%d\n", best_idx[i]);
	printf("\n");
}

按照顺序依次是（每个 label 位置都是按照处理器字长对齐的，如 64 位就是 8 字节对齐）：

label	内容长度	说明
kallsyms_addresses 或 kallsyms_offsets + kallsyms_relative_base	kallsyms_num_syms×sizeof(PTR) 或 kallsyms_num_syms×4+sizeof(PTR)	每个符号的地址或相对偏移，一般用后者，如果为 kallsyms_offsets 则存在 kallsyms_relative_base （即地址最小的非绝对符号的地址）。偏移用4字节，地址用处理器字长
kallsyms_num_syms	4	符号个数
kallsyms_names	取决于所有符号编码内容	每个符号的优化编码后的名字，长度前缀+内容，长度用 ULEB128 编码
kallsyms_markers	4×((kallsyms_num_syms+255)//256)	每 256 个符号相对 kallsyms_names 的偏移，第 0 个元素是 0 ，第一个元素表示第 0x100 个符号的位置，以此类推
kallsyms_seqs_of_names	kallsyms_num_syms×3	每个符号按照名字排序后对应的 index ，这个 index 是最开始按照符号地址排序的 index 。只用了 3 个字节编码，也就是可以编码 0x10_0000 = 1048576 约 100 万个符号
kallsyms_token_table	256个单词总长度	单词表，每个单词都是 0 结尾
kallsyms_token_index	2×256	256 个字符所编码的单词相对 kallsyms_token_table 的偏移（short，第 0 个元素是 0）

随着内核版本变更，这个布局也可能发生变化，如 6.4 改变了 kallsyms_offsets 的顺序，移动到了 kallsyms_token_index 下面。

链接到 vmlinux

参考 scripts/link-vmlinux.sh ，kallsyms 添加到链接 vmlinux 实际上经过了三次 ld ：

第一次生成临时 vmlinux ：ld vmlinux.o -> .tmp_vmlinux.kallsyms1 ，生成 .tmp_vmlinux.kallsyms1{,.map,.S,.o} 也就是依次得到符号表、生成汇编、编译。此时的 .tmp_vmlinux.kallsyms1 是不含 kallsyms 的 vmlinux
第二次生成临时 vmlinux ：ld vmlinux.o .tmp_vmlinux.kallsyms1.o -> .tmp_vmlinux.kallsyms2 ，生成 .tmp_vmlinux.kallsyms2{,.map,.S,.o} 将上一次的 .tmp_vmlinux.kallsyms1.o 再次与 vmlinux.o 链接，在 kallsyms 生成 .S 过程中确保第一次和第二次不会增加符号，因此 .tmp_vmlinux.kallsyms2.o 和 tmp_vmlinux.kallsyms1.o 除了地址稍微有出入，其余布局相同，且 .tmp_vmlinux.kallsyms2.o 是用 vmlinux.o + 与 .tmp_vmlinux.kallsyms2.o 相同布局的 .tmp_vmlinux.kallsyms1.o 链接的，因此符号偏移不再有变化，因此 .tmp_vmlinux.kallsyms2.o 就包含了所有正确的符号偏移（按照注释，实际上还可能做第三次 pass）
最终生成 vmlinux ： ld vmlinux.o .tmp_vmlinux.kallsyms2.o -> vmlinux ，由于布局相同，链接之后不影响符号偏移，应该得到具有正确 kallsyms 且符号偏移不变的 vmlinux 。脚本会生成 System.map ，并与 tmp_vmlinux.kallsyms2.map 比较，结果应当完全一致。

由于 kallsyms 生成过程要确保两次链接得到名字相同的符号表，因此 kallsyms 不含自己的符号（kallsyms_names 等）

解析

现在给定一个编译时打开了 kallsyms 的 kernel image ，如何确定它的 kallsyms 结构的位置？

我们知道 kernel image 是由 objcopy 生成的 binary 只包含内核的可加载段部分，不含 elf 头及符号表等结构，但是 kallsyms 是位于内核的可加载段中的(.rodata)，因此必然有办法能从 binaray 搜索到 kallsyms 结构的位置，这就要利用这些结构的特征进行搜索，下面我们以 vmlinux-to-elf 的 KallsymsFinder 为例分析如何搜索这些结构。

vmlinux-to-elf KallsymsFinder

下面参考 vmlinux-to-elf kallsyms.py

class KallsymsFinder:
    def __init__(
        self,
        kernel_img: bytes,
        bit_size: int = None,
        override_relative_base: bool = False,
        base_address: int = None,
        # extra_info: bool = False,
    ):
        self.override_relative_base = override_relative_base
        self.kernel_img = kernel_img

        # -

        self.find_linux_kernel_version()

        if bit_size:
            if bit_size not in (64, 32):
                exit(
                    '[!] Please specify a register bit size of either 32 or 64 bits'
                )
            else:
                self.is_64_bits = bit_size == 64

        self.guess_architecture()

        #self.extract_db_information()

        if self.is_64_bits:
            self.find_elf64_rela(base_address)
            self.apply_elf64_rela()

        # -

        try:
            self.find_kallsyms_token_table()
            self.find_kallsyms_token_index()
            self.uncompressed_kallsyms = False

        except KallsymsNotFoundException as first_error:  # Maybe an OpenWRT kernel with an uncompressed kallsyms
            try:
                self.find_kallsyms_names_uncompressed()
                self.find_kallsyms_markers_uncompressed()
                self.uncompressed_kallsyms = True

            except KallsymsNotFoundException:
                raise first_error

        else:
            self.find_kallsyms_markers()
            self.find_kallsyms_names()

        self.find_kallsyms_num_syms()
        self.find_kallsyms_addresses_or_symbols()

        # -

        self.parse_symbol_table()

确定 kallsyms_token_table

根据上述构建方式，一个常见的方式是确定 kallsyms_token_table ，这是因为内核符号往往包含了 0-9, a-z 和 A-Z ，因此在单词编码表中，它们都在原位，且在 kallsyms_token_table 中应该存在 0\01\02\0...9\0 / a\0b\0c\0...z\0 这样的子串。以使用 0-9 为例，通过搜索即可确定 kallsyms_token_table 中 0 字符的偏移（），接下来往前数 ord('0') 个 .asciz （零结尾字符串）即可找到 kallsyms_token_table 的开头，当然需要做一些校验，如是否对齐、中间遇到的字符串是否为 ascii 、前方是否还有 asciz 等。

def find_kallsyms_token_table(self):
    """
    kallsyms_token_table is an array of 256 variable length null-
    terminated string fragments. Positions which correspond to
    an ASCII character which is used in at least one symbol
    contain the corresponing character (1), other position contain
    a string fragment chosen by the compression algorithm (2).

    Hence, characters [0-9] and [a-z] are always present at their
    respective positions, but ":" (which comes after "9") never does.

    (1) See "insert_real_symbols_in_table" of "scripts/kallsyms.c"
    (2) See "optimize_result" of "scripts/kallsyms.c"
    """

    position = 0

    candidates_offsets = []  # offsets at which sequence_to_find was found
    candidates_offsets_followed_with_ascii = []  # variant with an higher certainty

    sequence_to_find = b''.join(
        b'%c\0' % i for i in range(ord('0'), ord('9') + 1)
    )

    sequences_to_avoid = [b':\0', b'\0\0', b'\0\1', b'\0\2', b'ASCII\0']

    while True:
        position = self.kernel_img.find(sequence_to_find, position + 1)
        if position == -1:
            break

        for seq in sequences_to_avoid:
            pos = position + len(sequence_to_find)
            if self.kernel_img[pos : pos + len(seq)] == seq:
                break
        else:
            candidates_offsets.append(position)

            if self.kernel_img[pos : pos + 1].isalnum():
                candidates_offsets_followed_with_ascii.append(position)

    if len(candidates_offsets) != 1:
        if len(candidates_offsets_followed_with_ascii) == 1:
            candidates_offsets = candidates_offsets_followed_with_ascii
        elif len(candidates_offsets) == 0:
            raise KallsymsNotFoundException(
                '%d candidates for kallsyms_token_table in kernel image'
                % len(candidates_offsets)
            )
        else:
            raise ValueError(
                '%d candidates for kallsyms_token_table in kernel image'
                % len(candidates_offsets)
            )

    position = candidates_offsets[0]

    # Get back to the beginning of the table

    current_index_in_array = ord('0')

    position -= 1
    assert position >= 0 and self.kernel_img[position] == 0

    for tokens_backwards in range(current_index_in_array):
        for chars_in_token_backwards in range(50):
            position -= 1
            assert position >= 0

            # (caveat: we may overlap on "kallsyms_markers" for the
            # last entry, so also check for high-range characters)

            if self.kernel_img[position] == 0 or self.kernel_img[
                position
            ] > ord('z'):
                break

            if chars_in_token_backwards >= 50 - 1:
                raise ValueError(
                    'This structure is not a kallsyms_token_table'
                )

    position += 1
    position += -position % 4

    self.kallsyms_token_table__offset = position

    logging.info(
        '[+] Found kallsyms_token_table at file offset 0x%08x'
        % self.kallsyms_token_table__offset
    )

确定 kallsyms_token_index

找到 kallsyms_token_table 开头之后就是往后数 256 个 .asciz 来确定 kallsyms_token_index （注意对齐），然后还要根据 kallsyms_token_index 的内容与 kallsyms_token_table 每个单词的位置做比较

在 kallsyms finder 中，这里尝试了不同的 endian 和不同的 index 长度（2，4，8），因此可以同时确定内核的 endian

def find_kallsyms_token_index(self):
    # Get to the end of the kallsyms_token_table

    position = self.kallsyms_token_table__offset

    all_token_offsets = []

    position -= 1

    for tokens_forward in range(256):
        position += 1

        all_token_offsets.append(
            position - self.kallsyms_token_table__offset
        )

        for chars_in_token_forward in range(50):
            position += 1

            if self.kernel_img[position] == 0:
                break

            if chars_in_token_forward >= 50 - 1:
                raise ValueError(
                    'This structure is not a kallsyms_token_table'
                )

    # Find kallsyms_token_index through the offset through searching
    # the reconstructed structure, also use this to guess endianness

    MAX_ALIGNMENT = 256
    KALLSYMS_TOKEN_INDEX__SIZE = 256 * 2

    memory_to_search = bytes(
        self.kernel_img[
            position : position
            + KALLSYMS_TOKEN_INDEX__SIZE
            + MAX_ALIGNMENT
        ]
    )

    little_endian_offsets = pack(
        '<%dH' % len(all_token_offsets), *all_token_offsets
    )
    big_endian_offsets = pack(
        '>%dH' % len(all_token_offsets), *all_token_offsets
    )

    found_position_for_le_value = memory_to_search.find(
        little_endian_offsets
    )
    found_position_for_be_value = memory_to_search.find(big_endian_offsets)

    if found_position_for_le_value == found_position_for_be_value == -1:
        raise ValueError('The value of kallsyms_token_index was not found')

    elif found_position_for_le_value > found_position_for_be_value:
        self.is_big_endian = False

        self.kallsyms_token_index__offset = (
            position + found_position_for_le_value
        )

    elif found_position_for_be_value > found_position_for_le_value:
        self.is_big_endian = True

        self.kallsyms_token_index__offset = (
            position + found_position_for_be_value
        )

    self.kallsyms_token_index_end__offset = (
        self.kallsyms_token_index__offset + len(little_endian_offsets)
    )

    logging.info(
        '[+] Found kallsyms_token_index at file offset 0x%08x'
        % self.kallsyms_token_index__offset
    )

确定 kallsyms_markers

接下来确定 kallsyms_markers ，这个在 kallsyms_token_table 之前，且是变长的距离。确定这个结构的方法就是往前找 0 ，因为 0 是列表里的第一个值，找到可能的位置之后，进行简单验证，取头 4 个元素，检查值的差距是否在可接受范围，具体参看下面的注释。

def find_kallsyms_markers(self):
    """
    kallsyms_markers contains one offset in kallsyms_names for each
    1 in 256 entries of it. Offsets are stored as either ".long"
    (a Gnu AS type that corresponds for example to 4 bytes in
    x86_64) since kernel v4.20, either as the maximum register
    byte of the system (the C "long" type) on older kernels.
    Remember about the size of this field for later.
    The first index is always 0, it is sorted, and it is aligned.
    """

    # Try possible sizes for the table element (long type)
    for table_element_size in (8, 4, 2):
        position = self.kallsyms_token_table__offset
        endianness_marker = '>' if self.is_big_endian else '<'
        long_size_marker = {2: 'H', 4: 'I', 8: 'Q'}[table_element_size]

        # Search for start of kallsyms_markers given first element is 0 and it is sorted
        for _ in range(32):
            # 往前尝试 32 次搜索 kallsyms_markers 的第 0 个元素
            position = self.kernel_img.rfind(
                b'\x00' * table_element_size, 0, position
            )
            # 第 0 个和第 1 个元素有可能是 00 00 00 00, 00 00 xx xx ，这样查找到的不是第 0 个元素，所以向下对齐
            position -= position % table_element_size
            # 只检查头 4 个元素，也就是 4*256=1024 个符号，一般 kallsyms 大小远大于此
            entries = unpack_from(
                endianness_marker + '4' + long_size_marker,
                self.kernel_img,
                position,
            )
            # 由于对齐了，再次检查可能的第 0 个元素是否为 0
            if entries[0] != 0:
                continue

            for i in range(1, len(entries)):
                # kallsyms_names entries are at least 2 bytes and at most 0x3FFF bytes long
                # 由于这里是每 256 个元素的偏移，因此两个元素之间满足一定的大小关系
                # 这里假设符号长度至少为 2 ，因此相隔 256 个符号偏移至少要相差 0x200
                # 又假设符号长度不超过 0x3fff （这是 ULEB128 两个字符能表示的最大限度）
                # 因此 256 个符号偏移不能超过 0x4000 * 0x100 = 0x400000 
                # （这里不知道为啥少了个 0 ，但是一般符号不会太长所以也合理）
                # 即： entries[i] - entries[i - 1] 在 (0x200, 0x40000] 之间才符合条件，此处隐含了 > 0
                if (
                    entries[i - 1] + 0x200 >= entries[i]
                    or entries[i - 1] + 0x40000 < entries[i]
                ):
                    break
            else:
                logging.info(
                    '[+] Found kallsyms_markers at file offset 0x%08x'
                    % position
                )
                self.kallsyms_markers__offset = position
                self.offset_table_element_size = table_element_size
                return
    raise ValueError('Could not find kallsyms_markers')

初步查找 kallsyms_names

接下来找 kallsyms_names ，在 kallsyms_markers 之前，首先估算 kallsyms_markers 大小，由于 kallsyms_marker 和 kallsyms_token_table 偏移已知，中间还夹着个 kallsyms_seqs_of_names ，所以只能大致推算大小的上界，然而 kallsyms_seqs_of_names 大小是 kallsyms_seqs_of_name 的 256/4*3 倍，因此这里做了大小的限制（3000*256 允许 768000 个kallsyms ，相比 10w 还是更大的），之后继续用上面的比较两个元素之间差距的排除法来确定 kallsyms_markers 的结束位置，最后当前位置减去 kallsyms_markers 最后一个元素（也就是第 ((symcount+255)//256*256) 个符号名字的偏移）再对齐，作为 kallsyms_names 的大致搜索位置（真正的位置可能在更前面），接下来要进一步搜索。

def find_kallsyms_names(self):
    position = self.kallsyms_markers__offset

    # Approximate the position of kallsyms_names based on the
    # last entry of "kallsyms_markers" - we'll determine the
    # precise position in the next method

    endianness_marker = '>' if self.is_big_endian else '<'

    long_size_marker = {2: 'H', 4: 'I', 8: 'Q'}[
        self.offset_table_element_size
    ]

    # Estimate kallsyms_markers length. Limit to 3000 for kernels with kallsyms_seqs_of_names
    num_of_kallsyms_markers_entries = (
        self.kallsyms_token_table__offset - self.kallsyms_markers__offset
    ) // self.offset_table_element_size

    kallsyms_markers_entries = unpack_from(
        endianness_marker
        + str(min(3000, num_of_kallsyms_markers_entries))
        + long_size_marker,
        self.kernel_img,
        self.kallsyms_markers__offset,
    )

    for i in range(1, len(kallsyms_markers_entries)):
        curr = kallsyms_markers_entries[i]
        last = kallsyms_markers_entries[i - 1]
        if last + 0x200 >= curr or last + 0x40000 < curr:
            kallsyms_markers_entries = kallsyms_markers_entries[:i]
            break

    last_kallsyms_markers_entry = list(
        filter(None, kallsyms_markers_entries)
    )[-1]

    position -= last_kallsyms_markers_entry

    position += -position % self.offset_table_element_size

    assert position > 0

    self.kallsyms_names__offset = position
    # Guessing continues in the function below (in order to handle the
    # absence of padding)

精确查找 kallsyms_names 和 kallsyms_num_syms

这里仍然利用 kallsyms_names 的特征：ULEB128 长度前缀字符串，从 kallsyms_markers 开始往前推算每个符号名字的位置，找到可能的第一个符号，并利用它前面的 kallsyms_num_syms 来验证，从而确定这两个结构。前面的工作只是减小了搜索范围。

# Symbol types are the same as exposed by "man nm"
class KallsymsSymbolType(Enum):
    # Seen in actual kernels
    ABSOLUTE = 'A'
    BSS = 'B'
    DATA = 'D'
    RODATA = 'R'
    TEXT = 'T'
    WEAK_OBJECT_WITH_DEFAULT = 'V'
    WEAK_SYMBOL_WITH_DEFAULT = 'W'

    # Seen on nm's manpage
    SMALL_DATA = 'G'
    INDIRECT_FUNCTION = 'I'
    DEBUGGING = 'N'
    STACK_UNWIND = 'P'
    COMMON = 'C'
    SMALL_BSS = 'S'
    UNDEFINED = 'U'
    UNIQUE_GLOBAL = 'u'
    WEAK_OBJECT = 'v'
    WEAK_SYMBOL = 'w'
    STABS_DEBUG = '-'
    UNKNOWN = '?'

    def find_kallsyms_num_syms(self):
        needle = -1

        token_table = self.get_token_table()
        possible_symbol_types = [i.value for i in KallsymsSymbolType]

        dp = []

        while needle == -1:
            position = self.kallsyms_names__offset

            # Check whether this looks like the correct symbol
            # table, first depending on the beginning of the
            # first symbol (as this is where an uncertain gap
            # of 4 padding bytes may be present depending on
            # versions or builds), then thorough the whole
            # table. Raise an issue further in the code (in
            # another function) if an exotic kind of symbol is
            # found somewhere else than in the first entry.

            first_token_index_of_first_name = self.kernel_img[position + 1]
            first_token_of_first_name = token_table[
                first_token_index_of_first_name
            ]

            # 找到一个可能是 kallsyms_names 开头的位置
            # 要满足第一个字符是 type 的可能字符
            # 这里似乎只假定这个符号长度是 <= 0x7f
            if (
                not (
                    first_token_of_first_name[0].lower() in 'uvw'
                    and first_token_of_first_name[0] in possible_symbol_types
                )
                and first_token_of_first_name[0].upper()
                not in possible_symbol_types
            ):
                # 不满足则往前进 4 ，因为对齐是 4
                self.kallsyms_names__offset -= 4
                if self.kallsyms_names__offset < 0:
                    raise ValueError('Could not find kallsyms_names')
                continue

            # Each entry in the symbol table starts with a u8 size followed by the contents.
            # The table ends with an entry of size 0, and must lie before kallsyms_markers.
            # This for loop uses a bottom-up DP approach to calculate the numbers of symbols without recalculations.
            # dp[i] is the length of the symbol table given a starting position of "kallsyms_markers - i"
            # If the table position is invalid, i.e. it reaches out of bounds, the length is marked as -1.
            # The loop ends with the number of symbols for the current position in the last entry of dp.


            # dp[i] 表示从 kallsyms_markers__offset-i 开始，如果可能为一个符号，则是倒数第几个符号，否则为 -1
            # kallsyms_names 结束到 kallsyms_markers 开始中间可能有 0 作为 padding ，
            # 每一个 0 都可以看作一个 uleb128 长度前缀的符号，也就是可以看作倒数第 0 个符号
            # 是否可能作为符号，取决于当前位置作为符号，计算长度之后，没有超过 kallsyms_markers__offset
            # 如果没有超过，则 next_i 就是下一个符号的位置，dp[next_i] 是下一个符号的倒数序号
            # 如果倒数序号不为 -1 就可以将当前位置看作是一个符号，其倒数序号为 dp[next_i] + 1
            # 从 0 开始是因为 kallsyms_markers 这个位置必然是 0 ，如果 kallsyms_names 长度刚好对齐，那么至少要有一个
            # 0 来作为起始位置
            # 这个 dp 数组在循环中是复用的
            for i in range(
                len(dp), self.kallsyms_markers__offset - position + 1
            ):
                curr = self.kernel_img[self.kallsyms_markers__offset - i]
                if curr & 0x80:
                    # "Big" symbol
                    symbol_size = (
                        curr & 0x7F
                        | (
                            self.kernel_img[
                                self.kallsyms_markers__offset - i + 1
                            ]
                            << 7
                        )
                    ) + 2
                else:
                    symbol_size = curr + 1
                next_i = i - symbol_size
                if curr == 0:  # Last entry of the symbol table
                    dp.append(0 if i <= 256 else -1)
                elif (
                    next_i < 0 or dp[next_i] == -1
                ):  # If table would exceed kallsyms_markers, mark as invalid
                    dp.append(-1)
                else:
                    dp.append(dp[next_i] + 1)
            num_symbols = dp[-1]

            if num_symbols < 256:
                self.kallsyms_names__offset -= 4
                if self.kallsyms_names__offset < 0:
                    raise ValueError('Could not find kallsyms_names')
                continue

            # 如果符号数量足够多，就可以尝试前一个 4 字节是否恰好为这个符号个数
            # 由于可能有对齐，因此不是恰好前一个 4 字节

            self.num_symbols = num_symbols

            # Find the long or PTR (it should be the same size as a kallsyms_marker
            # entry) encoding the number of symbols right before kallsyms_names

            endianness_marker = '>' if self.is_big_endian else '<'

            long_size_marker = {2: 'H', 4: 'I', 8: 'Q'}[
                self.offset_table_element_size
            ]

            MAX_ALIGNMENT = 256

            encoded_num_symbols = pack(
                endianness_marker + long_size_marker, num_symbols
            )

            needle = self.kernel_img.rfind(
                encoded_num_symbols,
                max(0, self.kallsyms_names__offset - MAX_ALIGNMENT - 20),
                self.kallsyms_names__offset,
            )

            if (
                needle == -1
            ):  # There may be no padding between kallsyms_names and kallsyms_num_syms, if the alignment is already correct: in this case: try other offsets for "kallsyms_names"
                self.kallsyms_names__offset -= 4
                if self.kallsyms_names__offset < 0:
                    raise ValueError('Could not find kallsyms_names')

            # 到这里就找到了 kallsyms_names 的准确偏移和 kallsyms_num_syms 的偏移和值

        logging.info(
            '[+] Found kallsyms_names at file offset 0x%08x (%d symbols)'
            % (self.kallsyms_names__offset, self.num_symbols)
        )

        position = needle

        self.kallsyms_num_syms__offset = position

        logging.info(
            '[+] Found kallsyms_num_syms at file offset 0x%08x' % position
        )

确定 kallsyms_address 或者 kallsyms_offsets + kallsyms_relative_base 的位置

由于这些结构在 kallsyms_num_syms 的上面（6.4 后的内核这些结构在 kallsyms_token_index 下面），而符号个数已知，因此只要利用大小和对齐性质即可找到他们的偏移，接下来只需要做一些验证，检查地址是否合理即可， vmlinux-to-elf 这部分代码较为繁琐且主要是处理不同版本、架构或编译选项带来的各种情况，故不贴出。