1 Commits

Author SHA1 Message Date
Luck
ddc2030a42 language detection 2024-08-24 10:31:50 +01:00
8 changed files with 151 additions and 34 deletions

View File

@@ -1,6 +1,8 @@
import { useEffect, useState } from 'react';
import Editor from './components/Editor';
import { loadFromBytebin } from './util/storage';
import { Language } from './util/language';
import { detectLanguage } from './util/detect-language';
const INITIAL = Symbol();
const LOADING = Symbol();
@@ -13,7 +15,7 @@ export default function App() {
const [state, setState] = useState<LoadingState>(INITIAL);
const [forcedContent, setForcedContent] = useState<string>('');
const [actualContent, setActualContent] = useState<string>('');
const [contentType, setContentType] = useState<string>();
const [contentType, setContentType] = useState<Language>();
function setContent(content: string) {
setActualContent(content);
@@ -28,8 +30,14 @@ export default function App() {
loadFromBytebin(pasteId).then(({ ok, content, type }) => {
if (ok) {
setContent(content);
if (type) {
if (type !== 'plain') {
setContentType(type);
} else {
detectLanguage(pasteId).then(detectedLanguage => {
if (detectedLanguage) {
setContentType(detectedLanguage);
}
});
}
} else {
setContent(get404Message(pasteId));

View File

@@ -7,12 +7,13 @@ import themes, { Themes } from '../style/themes';
import EditorControls from './EditorControls';
import EditorGlobalStyle from './EditorGlobalStyle';
import EditorTextArea from './EditorTextArea';
import { Language } from '../util/language';
export interface EditorProps {
forcedContent: string;
actualContent: string;
setActualContent: (value: string) => void;
contentType?: string;
contentType?: Language;
pasteId?: string;
}

View File

@@ -4,7 +4,7 @@ import { MutableRefObject, useCallback, useEffect, useState } from 'react';
import styled from 'styled-components';
import themes, { Themes } from '../style/themes';
import { languages } from '../util/highlighting';
import { languages, unknownLanguage } from '../util/language';
import { saveToBytebin } from '../util/storage';
import Button from './Button';
import { ResetFunction } from './Editor';
@@ -104,9 +104,9 @@ export default function EditorControls({
</Button>
<MenuButton
label="language"
value={language}
value={language === unknownLanguage ? '?' : language}
setValue={setLanguage}
ids={languages}
ids={languages as unknown as Record<string, string[]>}
/>
{readOnly && <Button onClick={unsetReadOnly}>[edit]</Button>}
</Section>

View File

@@ -2,3 +2,7 @@ export const bytebinUrl =
process.env.REACT_APP_BYTEBIN_URL || 'https://bytebin.lucko.me/';
export const postUrl = bytebinUrl + 'post';
// Endpoint used when the environment does not provide an override.
const defaultLanguageDetectionUrl =
  'https://language-detection-service.pastes.dev/';

/**
 * Base URL of the language detection service.
 *
 * `REACT_APP_LANG_DETECT_URL` takes precedence; the default is used when that
 * variable is unset or empty.
 */
// NOTE(review): the env lookup is kept as a literal `process.env.NAME`
// expression - presumably it is substituted at build time; confirm before
// refactoring it into a destructuring or helper.
export const languageDetectionUrl =
  process.env.REACT_APP_LANG_DETECT_URL || defaultLanguageDetectionUrl;

View File

@@ -0,0 +1,49 @@
import { languageDetectionUrl } from './constants';
import { Language } from './language';
/** One guess from the detection service, as this module reads it. */
interface DetectedLanguage {
  /** Language id in the service's own vocabulary; translated through `lookup`. */
  languageId: string;
  /** Score of the guess; only guesses scoring above 0.5 are considered. */
  confidence: number;
}

/**
 * Asks the language detection service which language a paste is written in.
 *
 * The guesses are scanned in the order the service returns them; the first one
 * with a confidence above 0.5 and an entry of its own in `lookup` wins.
 *
 * Detection is best-effort: network failures, non-OK responses and malformed
 * payloads all result in `null` rather than an exception.
 *
 * @param id - id of the paste; appended (URL-encoded) to `languageDetectionUrl`
 * @returns the mapped language, or `null` when nothing usable was detected
 */
export async function detectLanguage(id: string): Promise<Language | null> {
  try {
    // Encode the id so characters such as '/', '?' or '#' cannot alter the
    // request path or query.
    const resp = await fetch(languageDetectionUrl + encodeURIComponent(id));
    if (!resp.ok) {
      return null;
    }

    const results: unknown = await resp.json();
    if (!Array.isArray(results)) {
      return null;
    }

    for (const { languageId, confidence } of results as DetectedLanguage[]) {
      // Own-property check: a bare `lookup[languageId]` would also match keys
      // inherited from Object.prototype (e.g. "constructor", "toString") and
      // hand back a function instead of a Language.
      if (
        confidence > 0.5 &&
        Object.prototype.hasOwnProperty.call(lookup, languageId)
      ) {
        return lookup[languageId];
      }
    }
  } catch {
    // Deliberately ignored: a failed detection simply means "no language".
  }
  return null;
}
/**
 * Translation table from the ids reported by the detection service to the
 * language ids used by this application.
 */
const lookup: Record<string, Language> = {
  // --- plain text / configuration ---
  // The model confidently labels log files as ini; since log is the more
  // likely candidate, ini is reported as log.
  ini: 'log',
  json: 'json',
  xml: 'xml',
  yaml: 'yaml',

  // --- programming languages ---
  cpp: 'cpp',
  cs: 'csharp',
  go: 'go',
  groovy: 'java',
  java: 'java',
  js: 'javascript',
  kt: 'kotlin',
  py: 'python',
  rb: 'ruby',
  rs: 'rust',
  scala: 'scala',
  sh: 'shell',
  sql: 'sql',
  ts: 'typescript',

  // --- web ---
  css: 'css',
  html: 'html',
  php: 'php',

  // --- misc ---
  dockerfile: 'dockerfile',
  md: 'markdown',
};
// detected ids without a matching language here: csv, ml, ex, pas, bat, lua, v, jl, pm, prolog, matlab, clj, f90, c, tex, coffee, ps1, hs, mm, cmake, erl, dm, dart, asm, makefile, r, swift, lisp, vba, toml, cbl (groovy has none either, but is mapped to java above)

View File

@@ -1,23 +0,0 @@
/** Language ids, bucketed under the name of the group they belong to. */
export const languages = {
  text: ['plain', 'log'],
  config: ['yaml', 'json', 'xml', 'ini'],
  // prettier-ignore
  code: [
    'java', 'javascript', 'typescript', 'python', 'kotlin',
    'scala', 'cpp', 'csharp', 'shell', 'ruby',
    'rust', 'sql', 'go',
  ],
  web: ['html', 'css', 'scss', 'php', 'graphql'],
  misc: ['dockerfile', 'markdown', 'proto'],
};

/** Every id from `languages` in one flat list, in group order. */
export const languageIds = Object.values(languages).reduce<string[]>(
  (allIds, groupIds) => allIds.concat(groupIds),
  []
);

70
src/util/language.ts Normal file
View File

@@ -0,0 +1,70 @@
// The union is assembled from one alias per group so that each group can be
// read at a glance; the resulting `Language` type has exactly the same members.
type TextLanguage = 'plain' | 'plaintext' | 'log';

type ConfigLanguage = 'yaml' | 'json' | 'xml' | 'ini';

type CodeLanguage =
  | 'java'
  | 'javascript'
  | 'typescript'
  | 'python'
  | 'kotlin'
  | 'scala'
  | 'cpp'
  | 'csharp'
  | 'shell'
  | 'ruby'
  | 'rust'
  | 'sql'
  | 'go';

type WebLanguage = 'html' | 'css' | 'scss' | 'php' | 'graphql';

type MiscLanguage = 'dockerfile' | 'markdown' | 'proto';

/** Every language id the application recognises. */
export type Language =
  | TextLanguage
  | ConfigLanguage
  | CodeLanguage
  | WebLanguage
  | MiscLanguage;

/**
 * Id standing in for "no specific language"; its type is narrowed to the
 * literal `'plain'`.
 */
export const unknownLanguage: Language & 'plain' = 'plain';
/** Shape of the grouped language table: one list of language ids per group. */
export interface Languages {
  text: Language[];
  config: Language[];
  code: Language[];
  web: Language[];
  misc: Language[];
}

/**
 * Language ids bucketed by group. The key order and the order inside each
 * list are preserved by everything derived from this table.
 */
export const languages: Languages = {
  text: ['plaintext', 'log'],
  config: ['yaml', 'json', 'xml', 'ini'],
  // prettier-ignore
  code: [
    'java', 'javascript', 'typescript', 'python', 'kotlin',
    'scala', 'cpp', 'csharp', 'shell', 'ruby',
    'rust', 'sql', 'go',
  ],
  web: ['html', 'css', 'scss', 'php', 'graphql'],
  misc: ['dockerfile', 'markdown', 'proto'],
};
/**
 * Flat list of every known language id: all ids from `languages` in group
 * order, followed by `unknownLanguage`.
 */
export const languageIds: Language[] = Object.values(languages)
  .flat(1)
  .concat(unknownLanguage);
/**
 * Type guard telling whether an arbitrary string is one of the ids listed in
 * `languageIds`.
 *
 * @param lang - candidate language id
 * @returns `true` (narrowing `lang` to `Language`) when the id is known
 */
export function isLanguage(lang: string): lang is Language {
  // Viewing the list as plain strings lets `includes` accept any string
  // without asserting that the candidate already is a Language.
  const knownIds: readonly string[] = languageIds;
  return knownIds.includes(lang);
}

View File

@@ -1,12 +1,12 @@
import { gzip } from 'pako';
import MIMEType from 'whatwg-mimetype';
import { bytebinUrl, postUrl } from './constants';
import { languageIds } from './highlighting';
import { isLanguage, Language } from './language';
interface LoadResultSuccess {
ok: true;
content: string;
type?: string;
type?: Language;
}
interface LoadResultFail {
@@ -64,13 +64,21 @@ export async function saveToBytebin(
return null;
}
export function contentTypeToLanguage(contentType: string) {
export function contentTypeToLanguage(
contentType: string
): Language | undefined {
const { type, subtype: subType } = new MIMEType(contentType);
if (type === 'application' && subType === 'json') {
return 'json';
}
if (type === 'text' && languageIds.includes(subType.toLowerCase())) {
return subType.toLowerCase();
let subTypeLower = subType.toLowerCase();
if (subTypeLower.startsWith('x-')) {
subTypeLower = subTypeLower.substring(2);
}
if (type === 'text' && isLanguage(subTypeLower)) {
return subTypeLower;
}
}