Spaces:

pmkhanh7890
/

news_verification

Sleeping

App Files Files

news_verification / app.py

pmkhanh7890

1st

22e1b62 11 months ago

raw

history blame

13.4 kB

	import warnings

	import torchvision.transforms as transforms
	from google_img_source_search import ReverseImageSearcher

	# from src.images.CNN_model_classifier import predict_cnn
	# from src.images.diffusion_model_classifier import (
	# ImageClassifier,
	# predict_single_image,
	# )

	warnings.simplefilter(
	action="ignore",
	category=FutureWarning,
	) # disable FutureWarning

	import gradio as gr # noqa: E402
	from transformers import ( # noqa: E402
	AutoModelForSequenceClassification,
	AutoTokenizer,
	pipeline,
	)

	from src.texts.MAGE.deployment import ( # noqa: E402
	detect,
	preprocess,
	)
	from src.texts.PASTED.pasted_lexicon import Detector # noqa: E402
	from src.texts.Search_Text.search import ( # noqa: E402
	get_important_sentences,
	get_keywords,
	is_human_written,
	)
	from src.images.Search_Image.search import (
	compare_images,
	get_image_from_path,
	get_image_from_url,
	)


	def convert_score_range(score):
	"""
	Converts a score from the range [0, 1] to [-1, 1].

	Args:
	score: The original score in the range [0, 1].

	Returns:
	The converted score in the range [-1, 1].
	"""

	return 2 * score - 1


	def generate_highlighted_text(text_scores):
	"""
	Generates a highlighted text string based on the given text and scores.

	Args:
	text_scores: A list of tuples, where each tuple contains a text
	segment and its score.

	Returns:
	A string of HTML code with highlighted text.
	"""
	highlighted_text = ""
	for text, score in text_scores:
	# Map score to a color using a gradient
	color = f"rgba(255, 0, 0, {1 - score})" # Red to green gradient
	highlighted_text += (
	f"<span style='background-color: {color}'>{text}</span>" # noqa
	)
	return highlighted_text


	def separate_characters_with_mask(text, mask):
	"""Separates characters in a string and pairs them with a mask sign.

	Args:
	text: The input string.

	Returns:
	A list of tuples, where each tuple contains a character and a mask.
	"""

	return [(char, mask) for char in text]


	def detect_ai_text(model_name, search_engine, text):
	if search_engine is True:
	keywords = get_keywords(text)
	important_sentences = get_important_sentences(text, keywords)
	predictions = is_human_written(important_sentences[0])
	print("keywords: ", keywords)
	print("important_sentences: ", important_sentences)
	print("predictions: ", predictions)
	if predictions == -1:
	caption = "[Found exact match] "
	text_scores = list(zip([caption, text], [0, predictions]))
	print("text_scores: ", text_scores)
	return text_scores

	if model_name == "SimLLM":
	tokenize_input = SimLLM_tokenizer(text, return_tensors="pt")
	outputs = SimLLM_model(**tokenize_input)
	predictions = outputs.logits.argmax(dim=-1).item()
	if predictions == 0:
	predictions = "human-written"
	else:
	predictions = "machine-generated"

	elif model_name == "MAGE":
	processed_text = preprocess(text)
	predictions = detect(
	processed_text,
	MAGE_tokenizer,
	MAGE_model,
	device,
	)

	elif model_name == "chatgpt-detector-roberta":
	predictions = roberta_pipeline_en(text)[0]["label"]
	if predictions == "Human":
	predictions = "human-written"
	else: # ChatGPT
	predictions = "machine-generated"
	elif model_name == "PASTED-Lexical":
	predictions = detector(text)

	if model_name != "PASTED-Lexical":
	text_scores = list(zip([text], [predictions]))
	else:
	text_scores = []
	for text, score in predictions:
	new_score = convert_score_range(score) # normalize score
	text_scores.append((text, new_score))

	return text_scores


	diffusion_model_path = (
	"src/images/Diffusion/model_checkpoints/"
	"image-classifier-step=7007-val_loss=0.09.ckpt"
	)
	cnn_model_path = "src/images/CNN/model_checkpoints/blur_jpg_prob0.5.pth"


	def detect_ai_image(input_image_path, search_engine):
	# if search_engine is True:
	# Search image

	rev_img_searcher = ReverseImageSearcher()
	search_items = rev_img_searcher.search_by_file(input_image_path)
	min_result_difference = 5000
	result_image_url = ""
	input_image = get_image_from_path(input_image_path)

	for search_item in search_items:
	# print(f'Title: {search_item.page_title}')
	# print(f'Site: {search_item.page_url}')
	# print(f'Img: {search_item.image_url}\n')

	# Compare each search result image with the input image
	result_image = get_image_from_url(search_item.image_url)
	# input_image = get_image_from_url(search_item.image_url)
	result_difference = compare_images(result_image, input_image)

	print(f"Difference with search result: {result_difference}")
	print(f"Result image url: {search_item.page_url}\n")

	if min_result_difference > result_difference:
	min_result_difference = result_difference
	result_image_url = search_item.image_url
	result_page_url = search_item.page_url


	if result_difference == 0:
	break


	if min_result_difference == 0:
	result = f"<h1>Input image is LIKELY SIMILAR to image from:</h1>"\
	f"<ul>"\
	f'<li>\nPage URL: <a href="url">{result_page_url}</a></li>'\
	f'<li>\nImage URL: <a href="url">{result_image_url}</a></li>'\
	f"<li>\nDifference score: {min_result_difference}</li>"\
	f"</ul>"
	elif 10 > min_result_difference > 0:
	result = f"<h1>Input image is potentially a VARIATRION from:</h1>"\
	f"<ul>"\
	f'<li>\nPage URL: <a href="url">{result_page_url}</a></li>'\
	f'<li>\nImage URL: <a href="url">{result_image_url}</a></li>'\
	f"<li>\nDifference score: {min_result_difference}</li>"\
	f"</ul>"
	elif min_result_difference < 5000:
	result = f"<h1>Input image is not similar to any search results.</h1>"\
	f"<ul>"\
	f'<li>\nPage URL: <a href="url">{result_page_url}</a></li>'\
	f'<li>\nImage URL: <a href="url">{result_image_url}</a></li>'\
	f"<li>\nDifference score: {min_result_difference}</li>"\
	f"</ul>"
	else:
	result = f"<h1>No search result found.</h1>"\

	return result

	# def get_prediction_diffusion(image):
	# model = ImageClassifier.load_from_checkpoint(diffusion_model_path)

	# prediction = predict_single_image(image, model)
	# return (prediction >= 0.5, prediction)

	# def get_prediction_cnn(image):
	# prediction = predict_cnn(image, cnn_model_path)
	# return (prediction >= 0.5, prediction)

	# # Define the transformations for the image
	# transform = transforms.Compose(
	# [
	# transforms.Resize((224, 224)), # Image size expected by ResNet50
	# transforms.ToTensor(),
	# transforms.Normalize(
	# mean=[0.485, 0.456, 0.406],
	# std=[0.229, 0.224, 0.225],
	# ),
	# ],
	# )
	# image_tensor = transform(inp)
	# pred_diff, prob_diff = get_prediction_diffusion(image_tensor)
	# pred_cnn, prob_cnn = get_prediction_cnn(image_tensor)
	# verdict = (
	# "AI Generated" if (pred_diff or pred_cnn) else "No GenAI detected"
	# )
	# return (
	# f"<h1>{verdict}</h1>"
	# f"<ul>"
	# f"<li>Diffusion detection score: {prob_diff:.1%} "
	# f"{'(MATCH)' if pred_diff else ''}</li>"
	# f"<li>CNN detection score: {prob_cnn:.1%} "
	# f"{'(MATCH)' if pred_cnn else ''}</li>"
	# f"</ul>"
	# )


	# Define GPUs
	device = "cpu" # use 'cuda:0' if GPU is available

	# init MAGE
	model_dir = "yaful/MAGE" # model in huggingface
	MAGE_tokenizer = AutoTokenizer.from_pretrained(model_dir)
	MAGE_model = AutoModelForSequenceClassification.from_pretrained(model_dir).to(
	device,
	)

	# init chatgpt-detector-roberta
	model_dir = "Hello-SimpleAI/chatgpt-detector-roberta" # model in huggingface
	roberta_pipeline_en = pipeline(task="text-classification", model=model_dir)

	# init PASTED
	model_dir = "linzw/PASTED-Lexical"
	detector = Detector(model_dir, device)

	# init SimLLM
	model_path = "./models/single_model_detector"
	SimLLM_tokenizer = AutoTokenizer.from_pretrained(model_path)
	SimLLM_model = AutoModelForSequenceClassification.from_pretrained(model_path)

	# Init variable for UI
	title = """
	<center>

	<h1> AI-generated content detection </h1>
	<b> Demo by NICT & Tokyo Techies <b>

	</center>
	"""

	examples = [
	[
	"SimLLM",
	False,
	"""\
	The BBC's long-running consumer rights series Watchdog is to end as a \
	standalone programme, instead becoming part of The One Show. Watchdog \
	began in 1980 as a strand of Nationwide, but proved so popular it \
	became a separate programme in 1985. Co-host Steph McGovern has moved \
	to Channel 4, but Matt Allwright and Nikki Fox will stay to front the \
	new strand. The BBC said they would investigate viewer complaints all \
	year round rather than for two series a year.
	""",
	],
	[
	"chatgpt-detector-roberta",
	False,
	"""\
	Artificial intelligence (AI) is the science of making machines \
	intelligent. It enables computers to learn from data, recognize \
	patterns, and make decisions. AI powers many technologies we use \
	daily, from voice assistants to self-driving cars. It's rapidly \
	evolving, promising to revolutionize various industries and reshape \
	the future.""",
	],
	]

	model_remark = """<left>
	Model sources:
	<a href="https://github.com/Tokyo-Techies/prj-nict-ai-content-detection">SimLLM</a>,
	<a href="https://github.com/yafuly/MAGE">MAGE</a>,
	<a href="https://huggingface.co/Hello-SimpleAI/chatgpt-detector-roberta">chatgpt-detector-roberta</a>,
	<a href="https://github.com/Linzwcs/PASTED">PASTED-Lexical</a>.
	</left>
	""" # noqa: E501

	image_samples = [
	["src/images/samples/fake_dalle.jpg", "Generated (Dall-E)"],
	["src/images/samples/fake_midjourney.png", "Generated (MidJourney)"],
	["src/images/samples/fake_stable.jpg", "Generated (Stable Diffusion)"],
	["src/images/samples/fake_cnn.png", "Generated (GAN)"],
	["src/images/samples/real.png", "Organic"],
	[
	"https://p.potaufeu.asahi.com/1831-p/picture/27695628/89644a996fdd0cfc9e06398c64320fbe.jpg", # noqa E501
	"Internet GenAI",
	],
	]
	image_samples_path = [i[0] for i in image_samples]

	# UI
	with gr.Blocks() as demo:
	with gr.Row():
	gr.HTML(title)
	with gr.Row():
	with gr.Tab("Text"):
	with gr.Row():
	with gr.Column():
	model = gr.Dropdown(
	[
	"SimLLM",
	"MAGE",
	"chatgpt-detector-roberta",
	"PASTED-Lexical",
	],
	label="Detection model",
	)
	search_engine = gr.Checkbox(label="Use search engine")
	gr.HTML(model_remark)
	with gr.Column():
	text_input = gr.Textbox(
	label="Input text",
	placeholder="Enter text here...",
	lines=5,
	)

	output = gr.HighlightedText(
	label="Detection results",
	combine_adjacent=True,
	show_legend=True,
	color_map={
	"human-written": "#7d58cf",
	"machine-generated": "#e34242",
	},
	)

	gr.Examples(
	examples=examples,
	inputs=[model, search_engine, text_input],
	)
	model.change(
	detect_ai_text,
	inputs=[model, search_engine, text_input],
	outputs=output,
	)
	search_engine.change(
	detect_ai_text,
	inputs=[model, search_engine, text_input],
	outputs=output,
	)
	text_input.change(
	detect_ai_text,
	inputs=[model, search_engine, text_input],
	outputs=output,
	)
	with gr.Tab("Images"):
	with gr.Row():
	input_image = gr.Image(type="filepath")
	with gr.Column():
	output_image = gr.Markdown(height=400)
	gr.Examples(
	examples=image_samples,
	inputs=input_image,
	)

	input_image.change(
	detect_ai_image,
	inputs=input_image,
	outputs=output_image,
	)


	# demo.launch(share=True)
	demo.launch(allowed_paths=image_samples_path, share=True)