基于字符串匹配的病毒序列检测

Eddy Lv2

本程序实现了基于字符串匹配的病毒序列检测功能。
主体由两大字符串匹配算法的实现函数、自制的substr函数,以及KMP中的get_next函数组成。

1
2
3
4
5
6
7
int substr(char input[], int start, int length, char output[]) {
if (start < 0 start > strlen(input) length < 0 start + length - 1 > strlen(input))
return 0;
for (int i = start; i < start + length; i++)
output[i - start] = input[i];
return 1;
}

此外,在主函数中采用了创新性的整体结构,首先建立一个字符串outputBuffer充当输出缓冲区,在显示必要的输入提示之后,通过永真的while循环不停读取用户输入;当读取到双z时,输出缓冲区并退出(从Ctrl+Z获得灵感)。接下来,对于每一组S和T,遵循课上所说,利用sprintf将模式串复制两次后采用for循环每次提取新T串的前n位用于匹配(n为复制前T的长度)取得NextVal函数值,并调用两大算法进行匹配。此外还采用了逻辑短路机制,当前几次匹配中有成功者,后续就会不再比较。While循环体末尾采用了sprintf函数,将本次运行的结果以特定形式输出至outputBuffer。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
int count = 1, t_length = 0;
char outputBuffer[MAXSIZE] = {};
cout << endl << "General Instructions:" << endl;
cout << "Now please enter dna pairs, first is of the virus, the second is person's." << endl;
cout << "To Interrupt inputting, please enter 'z z' on a new line and then press enter." << endl << flush;
while (true) {
char s[MAXSIZE] = {}, t[MAXSIZE] = {}, t_part[MAXSIZE] = {};
//Processing one pair of DNA in one circulation
cin >> t;
cin >> s;
if (s[0] == 'z' && t[0] == 'z') {
//User Ready to view results
cout << outputBuffer << endl;
break;
}
t_length = strlen(t);
//Double the pattern string
sprintf(t, "%s%s", t, t);
bool BFResult = false, KMPResult = false;
for (int i = 0; i < t_length; i++) {
int nextv[MAXSIZE];
get_next(t, nextv);
//get each {t_length}characters of T string
substr(t, i, t_length, t_part);
//Used logical short-circuit, to avoid useless computing since first success in t_part
BFResult = BFResult (Index_BF(s, t_part) != -1);
KMPResult = KMPResult (Index_KMP(s, t_part, nextv) != -1);
}
sprintf(outputBuffer, "%s -The %d(th) , By BF:{%s}, By KMP:{%s}\n", outputBuffer, count++,
BFResult ? "Yes" : "No ", KMPResult ? "Yes" : "No ");
}

运行截图如下,其中,

第一行为考验哪个为模式串而设。
第二行为明显不匹配的字串。
第三行为明显匹配的。
第四行为考验环状DNA匹配功能是否正常工作。


完整代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#include <iostream>
using namespace std;
#define MAXSIZE 255
int substr(char input[], int start, int length, char output[]) {
if (start < 0 start > strlen(input) length < 0 start + length - 1 > strlen(input))
return 0;
for (int i = start; i < start + length; i++)
output[i - start] = input[i];
return 1;
}

int Index_BF(char S[], char T[]) {
int i = 0, j = 0;
int lenS = (int) strlen(S), lenT = (int) strlen(T);
while (i < lenS && j < lenT) {
if (S[i] == T[j]) {
i++, j++;
} else {
i = i - j + 1;
j = 0;
}
}
return (j >= lenT) ? i - lenT : -1;
}
void get_next(char t[], int nextv[]) {
// t2 is used to change the pattern string to start from index 1, not 0.
char t2[MAXSIZE] = {0};
for (int m = 0;; m++) {
t2[m + 1] = t[m];
if (t[m] == 0)
break;
}
int i = 1, j = 0;
nextv[1] = 0;
while (i < strlen(t)) {
if (j == 0 t2[i] == t2[j]) {
i++, j++;
nextv[i] = j;
} else {
j = nextv[j];
}
}
}
int Index_KMP(char S[], char T[], int nextv[]) {
int i = 1, j = 1;
char s2[MAXSIZE] = {0}, t2[MAXSIZE] = {0};
for (int m = 0;; m++) {
t2[m + 1] = T[m], s2[m + 1] = S[m];
if (T[m] == 0) break;
}
int lenS = (int) strlen(S), lenT = (int) strlen(T);
while (i <= lenS && j <= lenT) {
if (j == 0 s2[i] == t2[j]) {
i++, j++;
} else {
j = nextv[j];
}
}
return (j > lenT) ? i - lenT : -1;
}
int main() {
int count = 1, t_length = 0;
char outputBuffer[MAXSIZE] = {};
cout << endl << "General Instructions:" << endl;
cout << "Now please enter dna pairs, first is of the virus, the second is person's." << endl;
cout << "To Interrupt inputting, please enter 'z z' on a new line and then press enter." << endl << flush;
while (true) {
char s[MAXSIZE] = {}, t[MAXSIZE] = {}, t_part[MAXSIZE] = {};
//Processing one pair of DNA in one circulation
cin >> t;
cin >> s;
if (s[0] == 'z' && t[0] == 'z') {
//User Ready to view results
cout << outputBuffer << endl;
break;
}
t_length = strlen(t);
//Double the pattern string
sprintf(t, "%s%s", t, t);
bool BFResult = false, KMPResult = false;
for (int i = 0; i < t_length; i++) {
int nextv[MAXSIZE];
get_next(t, nextv);
//get each {t_length}characters of T string
substr(t, i, t_length, t_part);
//Used logical short-circuit, to avoid useless computing since first success in t_part
BFResult = BFResult (Index_BF(s, t_part) != -1);
KMPResult = KMPResult (Index_KMP(s, t_part, nextv) != -1);
}
sprintf(outputBuffer, "%s -The %d(th) , By BF:{%s}, By KMP:{%s}\n", outputBuffer, count++,
BFResult ? "Yes" : "No ", KMPResult ? "Yes" : "No ");
}
return 0;
}

Thanks for reading!

  • Title: 基于字符串匹配的病毒序列检测
  • Author: Eddy
  • Created at : 2022-12-13 21:27:00
  • Updated at : 2024-04-29 00:15:39
  • Link: https://blog.eddy.moe/2022/12/13/基于字符串匹配的病毒序列检测/
  • License: This work is licensed under CC BY-NC-SA 4.0.
Comments
On this page
基于字符串匹配的病毒序列检测