基于字符串匹配的病毒序列检测

本程序实现了基于字符串匹配的病毒序列检测功能。
主体由两大字符串匹配算法的实现函数、自制的substr函数，以及KMP中的get_next函数组成。

int substr(char input[], int start, int length, char output[]) {
    if (start < 0  start > strlen(input)  length < 0  start + length - 1 > strlen(input))
        return 0;
    for (int i = start; i < start + length; i++)
        output[i - start] = input[i];
    return 1;
}

此外，在主函数中采用了创新性的整体结构，首先建立一个字符串outputBuffer充当输出缓冲区，在显示必要的输入提示之后，通过永真的while循环不停读取用户输入；当读取到双z时，输出缓冲区并退出（从Ctrl+Z获得灵感）。接下来，对于每一组S和T，遵循课上所说，利用sprintf将模式串复制两次后采用for循环每次提取新T串的前n位用于匹配（n为复制前T的长度）取得NextVal函数值，并调用两大算法进行匹配。此外还采用了逻辑短路机制，当前几次匹配中有成功者，后续就会不再比较。While循环体末尾采用了sprintf函数，将本次运行的结果以特定形式输出至outputBuffer。

int count = 1, t_length = 0;
    char outputBuffer[MAXSIZE] = {};
    cout << endl << "General Instructions:" << endl;
    cout << "Now please enter dna pairs, first is of the virus, the second is person's." << endl;
    cout << "To Interrupt inputting, please enter 'z z' on a new line and then press enter." << endl << flush;
    while (true) {
        char s[MAXSIZE] = {}, t[MAXSIZE] = {}, t_part[MAXSIZE] = {};
        //Processing one pair of DNA in one circulation
        cin >> t;
        cin >> s;
        if (s[0] == 'z' && t[0] == 'z') {
            //User Ready to view results
            cout << outputBuffer << endl;
            break;
        }
        t_length = strlen(t);
        //Double the pattern string
        sprintf(t, "%s%s", t, t);
        bool BFResult = false, KMPResult = false;
        for (int i = 0; i < t_length; i++) {
            int nextv[MAXSIZE];
            get_next(t, nextv);
            //get each {t_length}characters of T string
            substr(t, i, t_length, t_part);
            //Used logical short-circuit, to avoid useless computing since first success in t_part
            BFResult = BFResult  (Index_BF(s, t_part) != -1);
            KMPResult = KMPResult  (Index_KMP(s, t_part, nextv) != -1);
        }
        sprintf(outputBuffer, "%s -The %d(th) , By BF:{%s}, By KMP:{%s}\n", outputBuffer, count++,
                BFResult ? "Yes" : "No ", KMPResult ? "Yes" : "No ");
    }

运行截图如下，其中，

第一行为考验哪个为模式串而设。
第二行为明显不匹配的字串。
第三行为明显匹配的。
第四行为考验环状DNA匹配功能是否正常工作。

完整代码如下：

#include <iostream>
using namespace std;
#define MAXSIZE 255
int substr(char input[], int start, int length, char output[]) {
    if (start < 0  start > strlen(input)  length < 0  start + length - 1 > strlen(input))
        return 0;
    for (int i = start; i < start + length; i++)
        output[i - start] = input[i];
    return 1;
}

int Index_BF(char S[], char T[]) {
    int i = 0, j = 0;
    int lenS = (int) strlen(S), lenT = (int) strlen(T);
    while (i < lenS && j < lenT) {
        if (S[i] == T[j]) {
            i++, j++;
        } else {
            i = i - j + 1;
            j = 0;
        }
    }
    return (j >= lenT) ? i - lenT : -1;
}
void get_next(char t[], int nextv[]) {
//    t2 is used to change the pattern string to start from index 1, not 0.
    char t2[MAXSIZE] = {0};
    for (int m = 0;; m++) {
        t2[m + 1] = t[m];
        if (t[m] == 0)
            break;
    }
    int i = 1, j = 0;
    nextv[1] = 0;
    while (i < strlen(t)) {
        if (j == 0  t2[i] == t2[j]) {
            i++, j++;
            nextv[i] = j;
        } else {
            j = nextv[j];
        }
    }
}
int Index_KMP(char S[], char T[], int nextv[]) {
    int i = 1, j = 1;
    char s2[MAXSIZE] = {0}, t2[MAXSIZE] = {0};
    for (int m = 0;; m++) {
        t2[m + 1] = T[m], s2[m + 1] = S[m];
        if (T[m] == 0) break;
    }
    int lenS = (int) strlen(S), lenT = (int) strlen(T);
    while (i <= lenS && j <= lenT) {
        if (j == 0  s2[i] == t2[j]) {
            i++, j++;
        } else {
            j = nextv[j];
        }
    }
    return (j > lenT) ? i - lenT : -1;
}
int main() {
    int count = 1, t_length = 0;
    char outputBuffer[MAXSIZE] = {};
    cout << endl << "General Instructions:" << endl;
    cout << "Now please enter dna pairs, first is of the virus, the second is person's." << endl;
    cout << "To Interrupt inputting, please enter 'z z' on a new line and then press enter." << endl << flush;
    while (true) {
        char s[MAXSIZE] = {}, t[MAXSIZE] = {}, t_part[MAXSIZE] = {};
        //Processing one pair of DNA in one circulation
        cin >> t;
        cin >> s;
        if (s[0] == 'z' && t[0] == 'z') {
            //User Ready to view results
            cout << outputBuffer << endl;
            break;
        }
        t_length = strlen(t);
        //Double the pattern string
        sprintf(t, "%s%s", t, t);
        bool BFResult = false, KMPResult = false;
        for (int i = 0; i < t_length; i++) {
            int nextv[MAXSIZE];
            get_next(t, nextv);
            //get each {t_length}characters of T string
            substr(t, i, t_length, t_part);
            //Used logical short-circuit, to avoid useless computing since first success in t_part
            BFResult = BFResult  (Index_BF(s, t_part) != -1);
            KMPResult = KMPResult  (Index_KMP(s, t_part, nextv) != -1);
        }
        sprintf(outputBuffer, "%s -The %d(th) , By BF:{%s}, By KMP:{%s}\n", outputBuffer, count++,
                BFResult ? "Yes" : "No ", KMPResult ? "Yes" : "No ");
    }
    return 0;
}

Thanks for reading!

Eddy Moments

基于字符串匹配的病毒序列检测