Final Project
Published: 2019-06-24


I. Boston House Price Prediction

1. Load the dataset.

2. Split the data into training and test sets.

3. Linear regression model: build a model that predicts house price from the 13 variables, and evaluate how well it performs.

4. Polynomial regression model: build a model that predicts house price from the 13 variables, and evaluate how well it performs.

5. Compare the performance of the linear and non-linear models and explain the difference (see the cross-validation sketch after the code below).

# Multivariate linear regression model
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split

# Boston housing dataset
data = load_boston()

# Split the dataset
x_train, x_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.3)

# Build the linear regression model
from sklearn.linear_model import LinearRegression
bos_lg = LinearRegression()
bos_lg.fit(x_train, y_train)
print('Coefficients', bos_lg.coef_, "\nIntercept", bos_lg.intercept_)

# Evaluate the model (metrics imported directly from sklearn.metrics)
from sklearn.metrics import mean_squared_error, mean_absolute_error
y_predict = bos_lg.predict(x_test)
# Compute the prediction metrics
print("Mean squared error:", mean_squared_error(y_test, y_predict))
print("Mean absolute error:", mean_absolute_error(y_test, y_predict))
# Print the model's R^2 score
print("Model score (R^2):", bos_lg.score(x_test, y_test))
print('=================\n')

# Multivariate polynomial regression model
# Generate degree-2 polynomial features
from sklearn.preprocessing import PolynomialFeatures
poly2 = PolynomialFeatures(degree=2)
x_poly_train = poly2.fit_transform(x_train)
x_poly_test = poly2.transform(x_test)

# Build the model
bos_lgp = LinearRegression()
bos_lgp.fit(x_poly_train, y_train)

# Predict
y_predict2 = bos_lgp.predict(x_poly_test)

# Evaluate the model
print("Mean squared error:", mean_squared_error(y_test, y_predict2))
print("Mean absolute error:", mean_absolute_error(y_test, y_predict2))
print("Model score (R^2):", bos_lgp.score(x_poly_test, y_test))
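On this task the degree-2 polynomial model usually scores higher: PolynomialFeatures(degree=2) turns the 13 variables into 105 columns of squares and pairwise interactions, so the model can fit non-linear structure in the data, at the price of a higher overfitting risk. Because a single random split is noisy, the following sketch (not part of the original code) compares both models with shuffled 5-fold cross-validation:

# Sketch: compare linear vs. degree-2 polynomial regression with 5-fold CV,
# so the conclusion does not hinge on one random train/test split.
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

data = load_boston()
cv = KFold(n_splits=5, shuffle=True, random_state=0)  # shuffle: the rows are ordered

linear = LinearRegression()
poly2 = make_pipeline(PolynomialFeatures(degree=2), LinearRegression())

print("linear mean R^2:", cross_val_score(linear, data.data, data.target, cv=cv).mean())
print("poly2  mean R^2:", cross_val_score(poly2, data.data, data.target, cv=cv).mean())

If the polynomial model ever scores worse on the test data, overfitting of the 105 features is the usual suspect; a regularized model such as Ridge would be the natural next step.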

 

II. Chinese Text Classification

Download the dataset that corresponds to the last digit of your student ID.

147: finance (财经), lottery (彩票), real estate (房产), stocks (股票)

258: home (家居), education (教育), technology (科技), society (社会), fashion (时尚)

0369: politics (时政), sports (体育), horoscopes (星座), games (游戏), entertainment (娱乐)

Build a Chinese text classification model for your categories and classify the texts. The basic steps are listed below; a compact sketch of steps 4-7 follows the list.

1. Read and write the various input files.

2. Remove noise, e.g. convert formats, strip symbols, and normalize the text.

3. Walk every text file under each folder.

4. Segment the Chinese text with jieba.

Chinese word segmentation splits a sentence into its individual words. Because segmentation is highly ambiguous across different contexts, this step is critically important.

You can add a single word with jieba.add_word('word') and import a whole word list with jieba.load_userdict('wordDict.txt').

Maintain a custom dictionary.

5. Remove stop words.

Maintain a stop-word list.

6. Compute word weights for the processed text with the TF-IDF algorithm.

7. Predict the category with naive Bayes.

8. Evaluate the model.

9. Predict the category of new text (a sketch follows the main program below).
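Before the full program, here is a self-contained toy sketch of steps 4-7 (segmentation, stop-word removal, TF-IDF, naive Bayes). The two example sentences, their labels, and the tiny stop-word set are invented purely for illustration:

import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

docs = ["今天股市大涨,股民信心增强。", "新楼盘开盘,房价小幅上涨。"]  # invented examples
labels = ["股票", "房产"]
stopword = {",", "。", "的"}  # tiny stand-in for the real stop-word list

# Steps 4/5: segment with jieba, drop stop words, re-join with spaces
cut_docs = [" ".join(w for w in jieba.cut(d) if w not in stopword) for d in docs]

# Step 6: TF-IDF weighting; step 7: multinomial naive Bayes
vec = TfidfVectorizer()
X = vec.fit_transform(cut_docs)
clf = MultinomialNB().fit(X, labels)
print(clf.predict(vec.transform(cut_docs)))  # sanity check on the training docs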

The model:

import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from myThread import my_main
import collections
import matplotlib.pyplot as plt
from pylab import mpl
mpl.rcParams['font.sans-serif'] = ['FangSong']  # default font, so Chinese labels render

'''
# Earlier single-threaded loader, superseded by myThread.my_main:
def get_data():
    data = []
    stopword = get_stopword()
    label = []
    for i in range(644580, 644602):  # 股票 (stocks)
        file = "d:/data/147/temp/股票/" + str(i) + ".txt"
        with open(file, 'r', encoding='utf-8') as f:
            news = f.read()
        this_news = ""
        for ci in jieba.cut(news):
            if ci not in stopword:
                this_news = this_news + ci + " "
        data.append(this_news)
        label.append('股票')
    for i in range(264410, 264429):  # 房产 (real estate)
        file = "d:/data/147/temp/房产/" + str(i) + ".txt"
        with open(file, 'r', encoding='utf-8') as f:
            news = f.read()
        this_news = ""
        for ci in jieba.cut(news):
            if ci not in stopword:
                this_news = this_news + ci + " "
        data.append(this_news)
        label.append('房产')
    for i in range(256822, 256843):  # 彩票 (lottery)
        file = "d:/data/147/temp/彩票/" + str(i) + ".txt"
        with open(file, 'r', encoding='utf-8') as f:
            news = f.read()
        this_news = ""
        for ci in jieba.cut(news):
            if ci not in stopword:
                this_news = this_news + ci + " "
        data.append(this_news)
        label.append('彩票')
    for i in range(798977, 798999):  # 财经 (finance)
        file = "d:/data/147/temp/财经/" + str(i) + ".txt"
        with open(file, 'r', encoding='utf-8') as f:
            news = f.read()
        this_news = ""
        for ci in jieba.cut(news):
            if ci not in stopword:
                this_news = this_news + ci + " "
        data.append(this_news)
        label.append('财经')
    return data, label

def get_stopword():
    # Load the stop-word list
    stopwords = [line.strip() for line in open('stopword.txt', 'r', encoding='utf-8').readlines()]
    stopwords.append('\u3000')
    stopwords.append('\n')
    return stopwords
'''

def xiangliang(x_train, x_test):
    # Vectorize: TF-IDF over unigrams and bigrams (norm='l2' is the default)
    vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 2), strip_accents='unicode')
    x_train = vectorizer.fit_transform(x_train)
    x_test = vectorizer.transform(x_test)
    return x_train, x_test, vectorizer

def beiNB(x_train, y_train, x_test):
    # Naive Bayes classifier
    clf = MultinomialNB().fit(x_train, y_train)
    y_nb_pred = clf.predict(x_test)
    return y_nb_pred, clf

def result(vectorizer, clf, y_test, y_nb_pred):
    # Classification results (y_test and y_nb_pred passed in rather than read from globals)
    from sklearn.metrics import confusion_matrix
    from sklearn.metrics import classification_report
    print('====================shape')
    print(y_nb_pred.shape, y_nb_pred)
    print('nb_confusion_matrix:')
    cm = confusion_matrix(y_test, y_nb_pred)
    print(cm)
    cr = classification_report(y_test, y_nb_pred)
    print(cr)
    # The lowest- and highest-weighted features for the first class
    feature_names = vectorizer.get_feature_names()
    coefs = clf.coef_
    intercept = clf.intercept_
    coefs_with_fns = sorted(zip(coefs[0], feature_names))
    n = 10
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    print('=================coef')
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print('\t%.4f\t%-15s\t\t%.4f\t%-15s' % (coef_1, fn_1, coef_2, fn_2))

if __name__ == '__main__':
    data, label = my_main()
    print(len(data))
    print(label, len(label))
    x_train, x_test, y_train, y_test = train_test_split(data, label, test_size=0.3,
                                                        random_state=0, stratify=label)
    X_train, X_test, vectorizer = xiangliang(x_train, x_test)
    y_nb_pred, clf = beiNB(X_train, y_train, X_test)
    result(vectorizer, clf, y_test, y_nb_pred)
    plt.rcParams['font.sans-serif'] = ['SimHei']   # render Chinese labels correctly
    plt.rcParams['axes.unicode_minus'] = False     # render minus signs correctly
    # Count the news items per category in the test set and in the predictions
    testCount = collections.Counter(y_test)
    predCount = collections.Counter(y_nb_pred)
    print('Actual:', testCount, '\n', 'Predicted:', predCount)
    # Build the label list, the actual counts, and the predicted counts
    nameList = list(testCount.keys())
    testList = list(testCount.values())
    # Look predicted counts up by name so both bar series stay aligned per category
    predictList = [predCount.get(name, 0) for name in nameList]
    x = list(range(len(nameList)))
    print("Categories:", nameList, '\n', "Actual:", testList, '\n', "Predicted:", predictList)
    # Plot
    plt.figure(figsize=(7, 5))
    total_width, n = 0.6, 2
    width = total_width / n
    plt.bar(x, testList, width=width, label='实际', fc='g')
    for i in range(len(x)):
        x[i] = x[i] + width
    plt.bar(x, predictList, width=width, label='预测', tick_label=nameList, fc='b')
    plt.grid()
    plt.title('实际和预测对比图', fontsize=17)
    plt.xlabel('新闻类别', fontsize=17)
    plt.ylabel('频数', fontsize=17)
    plt.legend(fontsize=17)
    plt.tick_params(labelsize=15)
    plt.show()
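Step 9, classifying a brand-new article, is not shown in the program above. A minimal sketch, assuming it runs after the training code above (clf and vectorizer already fitted) and borrowing the module-level stopword set from myThread.py; the sample article string is invented:

# Sketch: predict the category of a previously unseen article (step 9)
from myThread import stopword

new_text = "央行宣布下调存款准备金率,股市应声上涨。"  # invented example article
words = " ".join(w for w in jieba.cut(new_text) if w not in stopword)
X_new = vectorizer.transform([words])   # reuse the fitted TF-IDF vocabulary
print("Predicted category:", clf.predict(X_new)[0])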

 

myThread.py

import jieba
import threading
# from nlt_cut import get_stopword, xiangliang, beiNB, result
from sklearn.model_selection import train_test_split
import numpy as np

class myThread(threading.Thread):
    '''Thread class that reads one range of files'''
    def __init__(self, threadID, name, start_number, end_number):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.start_number = start_number
        self.end_number = end_number
    def run(self):
        print('Started reading files: ' + self.name)
        read_txt(self.name, self.start_number, self.end_number)
        print('Finished reading files: ' + self.name)

def get_stopword():
    '''Load the stop-word list'''
    stopwords = [line.strip() for line in open('stopword.txt', 'r', encoding='utf-8').readlines()]
    stopwords.append('\u3000')
    stopwords.append('\n')
    return stopwords

data = []
label = []
stopword = set(get_stopword())  # a set makes the membership test O(1)

def read_txt(threadName, start_number, end_number):
    for i in range(start_number, end_number):
        file = "d:/data/147//147/" + threadName + "/" + str(i) + ".txt"
        with open(file, 'r', encoding='utf-8') as f:
            news = f.read()
        this_news = ""
        for ci in jieba.cut(news):
            if ci not in stopword:
                this_news = this_news + ci + " "
        data.append(this_news)     # list.append is atomic under the GIL
        label.append(threadName)

def get_data():
    return data, label

def my_new_thread():
    '''One thread per category:
    thread1 = myThread(1, '财经', 798977, 836075)
    thread2 = myThread(2, '彩票', 256822, 264410)
    thread3 = myThread(3, '房产', 264410, 284460)
    thread4 = myThread(4, '股票', 644579, 798977)'''
    # Split the large categories across several threads instead
    thread1 = myThread(1, '财经', 798977, 810000)
    thread2 = myThread(2, '彩票', 256822, 260000)
    thread3 = myThread(3, '房产', 264410, 270000)
    thread4 = myThread(4, '股票', 644579, 700000)
    thread5 = myThread(5, '财经', 810000, 836075)
    thread6 = myThread(6, '彩票', 260000, 264410)
    thread7 = myThread(7, '房产', 270000, 284460)
    thread8 = myThread(8, '股票', 700000, 750000)
    thread9 = myThread(9, '股票', 750000, 798977)
    threads = [thread1, thread2, thread3, thread4, thread5, thread6, thread7, thread8, thread9]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    print('All reader threads finished')

def my_main():
    my_new_thread()
    data, label = get_data()
    return data, label

'''
if __name__ == '__main__':
    my_new_thread()
    data, label = get_data()
    np.save("d:/data.npy", np.array(data))
    np.save("d:/label.npy", np.array(label))
    print("=======================")
    x_train, x_test, y_train, y_test = train_test_split(data, label, test_size=0.3, random_state=0, stratify=label)
    X_train, X_test, vectorizer = xiangliang(x_train, x_test)
    y_nb_pred, clf = beiNB(X_train, y_train, X_test)
    result(vectorizer, clf, y_test, y_nb_pred)
'''
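Reading tens of thousands of small files is I/O-bound, so threads overlap the disk waits despite Python's GIL, and the shared list.append calls are atomic, so no locks are needed. The same fan-out can also be written with the standard library's thread pool; this is only a sketch reusing read_txt and get_data from the module above, not part of the original post:

# Sketch: replace the nine hand-rolled Thread objects with a thread pool.
from concurrent.futures import ThreadPoolExecutor

# Full category ranges, taken from the commented-out block above
jobs = [('财经', 798977, 836075), ('彩票', 256822, 264410),
        ('房产', 264410, 284460), ('股票', 644579, 798977)]

def my_main_pool():
    with ThreadPoolExecutor(max_workers=9) as pool:
        futures = [pool.submit(read_txt, name, lo, hi) for name, lo, hi in jobs]
        for f in futures:
            f.result()   # re-raise any exception from the worker threads
    return get_data()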

Reposted from: https://www.cnblogs.com/smallgrass/p/10165168.html
