{ "cells": [
 { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [
  "import os\n",
  "import sys\n",
  "\n",
  "import matplotlib.pyplot as plt\n",
  "import numpy as np\n",
  "import yaml\n",
  "from langchain_community.embeddings import HuggingFaceEmbeddings\n",
  "from sklearn.manifold import TSNE\n",
  "\n",
  "# Put the parent directory on sys.path so that `prompt` can be imported from there.\n",
  "sys.path.append(os.path.abspath('..'))\n",
  "from prompt import PromptEngine"
 ] },
 { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [
  "# The embedding model is loaded through langchain's HuggingFaceEmbeddings wrapper.\n",
  "# (Earlier alternative, no longer used: BCEmbedding.EmbeddingModel(model_name_or_path=...).)\n",
  "model = HuggingFaceEmbeddings(model_name='maidalun1020/bce-embedding-base_v1')\n",
  "\n",
  "# Two throwaway sentences for a smoke test. They get their own name so they can never be\n",
  "# mistaken for the training `sentences` built further down when cells run out of order.\n",
  "sample_sentences = ['python 是什么', '请介绍一下 python']"
 ] },
 { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [
  "# Smoke test: check that the model can embed text before the real data is loaded.\n",
  "sample_embeddings = model.embed_documents(sample_sentences)"
 ] },
 { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(['请问 property.json 如何配置?',\n", " '我的自动补全无法使用,是不是有bug?',\n", " '帮我上传一下这份数据',\n", " 'surface了解一下?',\n", " '大佬们,为啥我的digital ide启动之后所有功能都没启动捏?我配置了property文件,然后插件的vivado路经和modelsim路经都加上了',\n", " '这群要被chisel夺舍了吗',\n", " 'Metals一开直接报错',\n", " '话说digital-ide打开大的verilog卡死了',\n", " '请问一下,第一次点击对文件仿真可以出波形文件,再次点击的时候就会提示unknown module type了。是哪个配置没配置好?',\n", 
" '怎么调整是哪个版本的vivado来构建工程呢',\n", " '咱们这个插件win7的vscode是不是只能用很早之前的版本',\n", " '帮我将这份数据保存到服务器上',\n", " '他这个意思是 单个功耗很低 但是功耗低那肯定性能就寄 频率肯定不高 靠人多',\n", " '我平时写代码就喜欢喝茶',\n", " '感觉现在啥都在往AI靠',\n", " '请问你们自动对齐插件用的啥?',\n", " '不得不放一下我的咖啡笔记了',\n", " 'stm32有什么好玩的应用不',\n", " '别人设置的肯定有点不合适自己的',\n", " 'http://hehezhou.cn/register2024/AArch64-regindex.html',\n", " '因为他们py本领不是很强,需要这些东西辅助',\n", " '写C写多了,顺手在pycharm写了个main.c',\n", " '好流畅的にほんじんです',\n", " '有没有接触过UI开发的,想做一款寄存器管理的工具,想把界面做的好看一点',\n", " '现在嘉立创也在做FPGA了?',\n", " '大佬们,更新0.3.3之后,用iverilog仿真,testbench中还是例化模块出错:unknown module type,这是什么原因啊?',\n", " '查了一下记录,2017年买的静电容',\n", " '我小时候电脑刚买回来一星期就被我玩坏了',\n", " 'command not found: python',\n", " 'path top.v is not a hdlFile 请问报这个错误大概是啥原因啊',\n", " '咖啡喝不了,喝了胃不舒服',\n", " '关于波形显示的一些建议',\n", " '采用iverilog生成的VCD貌似无法解析仿真数据',\n", " '【v0.3.2】模块调用后netlist生成错误,且仿真报错',\n", " '网表优化与插入文档',\n", " '插件文档导出问题',\n", " '【v0.3.2】testbench修改之后再次仿真会报错',\n", " '[0.3.2] [问题] 含参数的 Verilog 模块自动例化,代码格式不正确',\n", " '【报错】RuntimeError: null function or function signature mismatch',\n", " '报错:verilog解析器无法解析以下代码',\n", " '报错:verilog解析器的bug',\n", " '功能建议-能增加点类似Verilog-Mode的功能么',\n", " '报错:RuntimeError: null function or function signature mismatch、无法识别HDL文件',\n", " '例化模块自动生成tb文件报错Unknown module type',\n", " 'Errors happen when parsing d:/danpj/fpga/modelsim/mod1/user/src/count4.v. Error: \"RuntimeError: null function or function signature mismatch\". 
Just propose a valuable issue in our github repo ',\n", " '自动例化报错',\n", " '【问题】【0.3.2】重复提示 Error: \"RuntimeError: null function or function signature mismatch\"',\n", " '【0.3.2】【问题】1无法解析localparam 2 带参数模块例化',\n", " '基础教程太少',\n", " '.v源文件未被正确识别',\n", " '重复仿真时报错',\n", " '插件不能使用',\n", " '关于netlist的生成错误',\n", " '[0.3.2] 支持对verilator 的dpi-c机制的支持 ',\n", " '[0.3.2] 离线支持+SV支持',\n", " 'Bad webstie connection on README',\n", " '在声明数据位宽时使用宏定义会报错',\n", " '0.3.2 无verilog语法检查,且提示RuntimeError',\n", " '[0.3.2] 模块定义跳转偶尔会出现问题',\n", " '[0.3.2] 代码补全有多个内容完全相同的选项',\n", " '[0.3.2] 例化模块的类型,模块名称的代码高亮不变色',\n", " '文档中的params和ports数反了',\n", " '[建议]:优化Formatter与文档生成',\n", " '[0.3.2] Linter(vivado) 启用无效 (还是说我用的vivado2023太新了?)',\n", " '[0.3.2]module的#后的parameter能悬停显示数值, 但内部parameter的不能',\n", " '[0.3.2]param语法错误会弹右下角报错弹窗, TreeView刷新按钮无效',\n", " '0.3.0版本后存在bug,构建项目后仿真无法运行',\n", " '建议:模块例化可以基于文件夹来检索,当文件比较多时更整洁一点。',\n", " 'Add Questa-Sim into the linter option',\n", " '悬停提示对 /**/ 型的注释有误',\n", " '仿真时因为文件夹名字存在空格产生错误',\n", " 'filetype from json to jsonc (support comments)',\n", " '语法识别错误',\n", " 'code to doc',\n", " 'WSL环境点击“显示当前文件的FSM图”时会发生扩展远程主机终止的错误',\n", " '在架构里不能解析include后的模块',\n", " 'treeview在文件移动时遇到问题',\n", " 'treeview在文件变动时产生错误',\n", " '[0.3.0 beta] iverilog指令错误',\n", " 'Will verilator be supported?',\n", " '[0.3.0 beta] 高亮颜色错误',\n", " '状态机显示有问题',\n", " 'xdc文件无法高亮显示',\n", " 'Ubuntu环境下,digital ide对配置环境有问题。',\n", " '[0.3.0 beta] \"定义跳转\"定义位置出错',\n", " '[0.3.0 beta] \"library导入文件\"模块解析报错',\n", " '[0.3.0 beta] \"对当前文件进行仿真\"功能报错',\n", " '[0.3.0 beta] 由于插件不能分析出使用了\"include\"语法, 从而导致Sim失败',\n", " '[0.3.0 beta] 文件跳转功能失效',\n", " '[0.3.0 beta] 例化中的[xx:xx]连线会导致后续颜色错误',\n", " '[0.3.0 beta]\"显示当前文件的netlist\"无法正确显示出网表',\n", " '[0.3.0 beta] Bitwidth of 1-bit signal is incorrectly recognized as \"Unknown\" in auto instantiation and auto document',\n", " '[0.3.3 beta] 含参数模型的例化存在问题以及拓展快捷键失效的问题',\n", " \"[0.3.2]数值悬停提示不支持'_'语法\"],\n", " [1,\n", " 2,\n", " 3,\n", " 5,\n", " 1,\n", " 5,\n", " 5,\n", " 2,\n", " 
" 1,\n", " 1,\n", " 1,\n", " 3,\n", " 5,\n", " 1,\n", " 5,\n", " 1,\n", " 5,\n", " 1,\n",
" 5,\n", " 5,\n", " 5,\n", " 5,\n", " 5,\n", " 1,\n", " 1,\n", " 2,\n", " 5,\n", " 5,\n",
" 5,\n", " 1,\n", " 5,\n", " 4,\n", " 2,\n", " 2,\n", " 4,\n", " 2,\n", " 2,\n", " 2,\n",
" 2,\n", " 2,\n", " 2,\n", " 4,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n",
" 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n",
" 2,\n", " 2,\n", " 2,\n", " 2,\n", " 4,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 4,\n",
" 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n",
" 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n",
" 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2,\n", " 2])" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "# Load the base stories, then merge in the second story file.\n",
  "engine = PromptEngine('../config/story.yml')\n",
  "engine.merge_stories_from_yml('../config/github-issue.story.yml')\n",
  "\n",
  "# Parallel lists: sentences[i] is a story message, labels[i] its intent id.\n",
  "sentences = []\n",
  "labels = []\n",
  "for story in engine.stories:\n",
  "    sentences.append(story.message)\n",
  "    labels.append(engine.intent2id[story.intent])\n",
  "sentences, labels"
 ] },
 { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(94, 768)" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "# One embedding row per story message, in the same order as `sentences`.\n",
  "embedding = np.array(model.embed_documents(sentences))\n",
  "# Every later step indexes embeddings and labels together, so fail fast if they diverge.\n",
  "assert embedding.shape[0] == len(labels), 'embeddings and labels are misaligned'\n",
  "embedding.shape"
 ] },
 { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [
  "# t-SNE is stochastic: pin the seed so the 2-D layout is the same on every run.\n",
  "tsne = TSNE(n_components=2, random_state=42)\n",
  "plots = tsne.fit_transform(embedding)"
 ] },
 { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
  "# Boolean masking below needs an array; np.array() also keeps this cell safe to re-run.\n",
  "labels = np.array(labels)\n",
  "\n",
  "fig, ax = plt.subplots()\n",
  "# Walk the intent ids in sorted order so legend order and colours are stable across runs.\n",
  "for label in sorted(set(labels.tolist())):\n",
  "    mask = labels == label\n",
  "    cor_plots = plots[mask]\n",
  "    ax.scatter(cor_plots[:, 0], cor_plots[:, 1], s=50, alpha=0.9, label=engine.id2intent[label])\n",
  "ax.set(title='t-SNE projection of story embeddings', xlabel='t-SNE 1', ylabel='t-SNE 2')\n",
  "ax.legend()"
 ] },
 { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
LogisticRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "LogisticRegression()" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "from sklearn.linear_model import LogisticRegression\n",
  "\n",
  "# Classifier that maps a sentence embedding to an intent id.\n",
  "log_model = LogisticRegression()\n",
  "log_model.fit(embedding, labels)"
 ] },
 { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([5])" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "# Smoke test only: this sentence is also one of the training messages,\n",
  "# so the result says nothing about how well the model generalises.\n",
  "test_sentence = ['咖啡喝不了,喝了胃不舒服']\n",
  "test_embedding = model.embed_documents(test_sentence)\n",
  "log_model.predict(test_embedding)"
 ] },
 { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['../model/embedding_mapping.sklearn']" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "import joblib\n",
  "\n",
  "MODEL_PATH = '../model/embedding_mapping.sklearn'\n",
  "# joblib.dump does not create missing directories, so make sure the target exists first.\n",
  "os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)\n",
  "joblib.dump(log_model, MODEL_PATH)"
 ] },
 { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [
  "# joblib files are pickle-based: only load files written by a trusted source.\n",
  "log_model = joblib.load(MODEL_PATH)"
 ] },
 { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([5])" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "# Round-trip check: the reloaded model should give the same prediction as before saving.\n",
  "log_model.predict(test_embedding)"
 ] },
 { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
SVC()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "SVC()" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "from sklearn.svm import SVC\n",
  "\n",
  "# Second classifier trained on the same embeddings and labels, for comparison.\n",
  "svm = SVC()\n",
  "svm.fit(embedding, labels)"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "# Same single-sentence smoke test as for the logistic-regression model, now against the SVM.\n",
  "# The sentence is part of the training messages, so this is not an evaluation.\n",
  "test_sentence = ['咖啡喝不了,喝了胃不舒服']\n",
  "test_embedding = model.embed_documents(test_sentence)\n",
  "# Nothing is saved here: the old commented-out joblib.dump(log_model, ...) line only\n",
  "# rewrote the file that the earlier save cell already produces.\n",
  "svm.predict(test_embedding)"
 ] }
], "metadata": { "kernelspec": { "display_name": "base", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.5" } }, "nbformat": 4, "nbformat_minor": 2 }