以下为卖家选择提供的数据验证报告:
数据描述
This data was reformatted from "Midjourney User Prompts & Generated Images (250k)". Please give that dataset an upvote (+1), and this one as well. The original has a much better introduction: https://www.kaggle.com/datasets/succinctlyai/midjourney-texttoimage
TL;DR:
- 250k.csv (248k rows): good for text search to show images. Contains re-runs, which could have similar output images.
- reduced.csv (130k rows): no re-runs, but may contain duplicate text with different arguments.
- raw.csv (251k rows): messages containing commands and probably unwanted content. URLs are full length.
Columns: img_url is the right-hand end of the image URL (the common prefix is removed). text is the text portion of the /imagine command, excluding arguments and input URLs.
$ wc -l *
251390 midjourney_2022_250k_raw.csv
248069 midjourney_2022_250k.csv
130407 midjourney_2022_reduced.csv
The common URL prefix has been removed from img_url to save memory. Re-attach it like so:
image: https://cdn.discordapp.com/attachments/ + img_url
image: https://media.discordapp.net/attachments/ + img_url
job page: https://www.midjourney.com/app/jobs/ + job_id
The raw version contains errors, chat, server messages, and other anomalies. It's not recommended unless you want to start over.
- Mistake: the "thumbnail" field (`_thumb_url` in the code below) is not a thumbnail; it is just another URL for the attachment.
Probably all the code you need.
import glob
import json
import os
import re
import sys

import numpy as np
import pandas as pd

try:
    from tqdm.notebook import tqdm
except ImportError:
    def tqdm(iterable, *args, **kwargs):
        """Fallback when tqdm is unavailable: iterate without a progress bar."""
        return iterable


# Prefix stripped from ``img_url`` to keep the CSV files small.
_ATTACHMENT_PREFIX = 'https://media.discordapp.net/attachments/'
# The command is the text wrapped in runs of 2-20 asterisks.
_CMD_RE = re.compile(r'[*]{2,20}(.+)[*]{2,20}')
# A job id is a run of 36 hex-or-dash characters.
_JOB_ID_RE = re.compile(r'[a-f0-9-]{36,36}')
# One ``<...>`` token. Matching each token separately (instead of greedily
# from the first ``<`` to the last ``>``) keeps any text that sits between
# two such tokens.
_ANGLE_TOKEN_RE = re.compile(r'<[^>]+>')


def extract_data(filename, ii=0):
    """Collect one row per attachment from a JSON message dump.

    The file must hold an object with a ``'messages'`` key whose value is a
    sequence of message groups; only the first message of each group is read
    (presumably the search hit of a Discord export -- TODO confirm).

    Args:
        filename: Path of the JSON file, read as UTF-8.
        ii: Running count of rows collected so far.

    Returns:
        A ``(data, ii)`` tuple. ``data`` is a list of
        ``[timestamp, content, url, proxy_url]`` rows, one per attachment;
        ``ii`` is the incoming count plus ``len(data)``.
    """
    # Read explicitly as UTF-8: the messages contain non-ASCII characters
    # (e.g. em dashes) that a platform-default codec may fail to decode.
    with open(filename, encoding='utf-8') as fh:
        obj = json.load(fh)

    data = []
    for group in obj['messages']:
        for message in group[:1]:
            content = message.get('content', None)
            timestamp = message.get('timestamp', '')
            # ``or []`` also covers an explicit JSON null.
            for attachment in message.get('attachments') or []:
                data.append([
                    timestamp,
                    content,
                    attachment.get('url', ''),
                    attachment.get('proxy_url', ''),
                ])
    return data, ii + len(data)


def _as_text(value):
    """Return *value* if it is a string, otherwise an empty string."""
    return value if isinstance(value, str) else ''


def _extract_cmd(message):
    """Return the asterisk-wrapped command of *message*, or None if absent."""
    if '*' not in message:
        return None
    found = _CMD_RE.findall(message.replace('\n', ' '))
    # Em dashes are normalised back to the ``--`` argument marker.
    return found[0].replace('—', '--') if found else None


def _extract_job_id(url):
    """Return the first job-id-shaped substring of *url*, or None."""
    found = _JOB_ID_RE.findall(url)
    return found[0] if found else None


def _extract_text(cmd):
    """Return *cmd* without ``<...>`` tokens and without ``--`` arguments."""
    return _ANGLE_TOKEN_RE.sub('', cmd).split('--')[0].strip()


def expand_df(df, random_state=None):
    """Derive the cleaned prompt table from the raw attachment rows.

    Expects the columns ``_message``, ``_thumb_url`` and ``img_url``. The
    input frame is left untouched; all work happens on copies.

    Steps: drop duplicate ``img_url`` rows, drop rows with an empty or
    missing message, keep only rows that contain an asterisk-wrapped
    command, strip the attachment prefix from ``img_url`` and keep only
    values containing ``png``, derive ``job_id`` from ``_thumb_url`` and
    drop duplicate job ids, then derive ``text`` from ``cmd``.

    Args:
        df: Raw frame as built from ``extract_data`` rows.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            final shuffle can be reproduced. ``None`` gives a fresh shuffle.

    Returns:
        A shuffled DataFrame with the added columns ``L`` (message length),
        ``cmd``, ``job_id`` and ``text``.
    """
    df = df.drop_duplicates(subset='img_url').copy()

    # A missing message (None/NaN) counts as empty instead of crashing len().
    df['_message'] = df['_message'].apply(_as_text)
    df['L'] = df['_message'].apply(len)
    df = df[df['L'] > 0].copy()

    df['cmd'] = df['_message'].apply(_extract_cmd)
    df = df[df['cmd'].notnull()].copy()

    df['img_url'] = df['img_url'].str.replace(
        _ATTACHMENT_PREFIX, '', regex=False)
    df = df[df['img_url'].str.contains('png', na=False)].copy()

    df['job_id'] = df['_thumb_url'].apply(_extract_job_id)
    # NOTE: rows without a job id all share the key None, so at most one of
    # them survives this step (same as the original behaviour).
    df = df.drop_duplicates(subset='job_id').copy()

    df['text'] = df['cmd'].apply(_extract_text)
    return df.sample(frac=1.0, random_state=random_state)


if __name__ == "__main__":
    # DATA_DIR (prefix of the input *.json files) and BASE_DIR (prefix of the
    # output CSV files) are not defined in this snippet; set them first.
    files = glob.glob(f'{DATA_DIR}*.json')
    ii = 0
    data = []
    for filename in tqdm(files):
        dd, ii = extract_data(filename, ii)
        data += dd

    columns = 'timestamp,_message,_thumb_url,img_url'.split(',')
    df = pd.DataFrame(data, columns=columns)
    df.to_csv(BASE_DIR+'midjourney_2022_250k_raw.csv')

    df = expand_df(df)
    # Columns whose name starts with an underscore are internal; drop them.
    df = df[[c for c in df.columns if len(c) > 0 and str(c)[0] != '_']]
    df.to_csv(BASE_DIR+'midjourney_2022_250k.csv')

    # Shuffle first so the row kept for each duplicated command is random.
    df = df.sample(frac=1.0).drop_duplicates(['cmd'])
    df = df.sort_values('L', ascending=True)
    df.to_csv(BASE_DIR+'midjourney_2022_reduced.csv')
