GitHub: https://github.com/fkryan/gazelle
This is the official implementation for Gaze-LLE, a transformer approach for estimating gaze targets that leverages the power of pretrained visual foundation models. Gaze-LLE provides a streamlined gaze architecture that learns only a lightweight gaze decoder on top of a frozen, pretrained visual encoder (DINOv2). Gaze-LLE learns 1-2 orders of magnitude fewer parameters than prior works and doesn't require any extra input modalities like depth and pose!
det_face.onnx
Model Properties
Inputs
name:input.1
tensor:Float[1, 3, -1, -1]
Outputs
name:448
tensor:Float[12800, 1]
name:471
tensor:Float[3200, 1]
name:494
tensor:Float[800, 1]
name:451
tensor:Float[12800, 4]
name:474
tensor:Float[3200, 4]
name:497
tensor:Float[800, 4]
name:454
tensor:Float[12800, 10]
name:477
tensor:Float[3200, 10]
name:500
tensor:Float[800, 10]
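These nine outputs are the face detector's score, box-offset, and 5-point-landmark maps at three FPN strides (8, 16, 32), with two anchors per feature-map cell. The row counts follow from that layout: (S/stride)² × 2 anchors for an S×S input, so the shapes above match S = 640 ((640/8)² × 2 = 12800, (640/16)² × 2 = 3200, (640/32)² × 2 = 800). Since the input height and width are dynamic (-1), the 512×512 size used by the demo below yields 8192, 2048, and 512 rows instead.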
gazelle_dinov2_vitl14_inout_1x3x448x448_1xNx4.onnx
Model Properties
Inputs
name:image_bgr
tensor:Float[1, 3, 448, 448]
name:bboxes_x1y1x2y2
tensor:Float[1, -1, 4]
Outputs
name:heatmap
tensor:Float[-1, 64, 64]
name:inout
tensor:Float[-1]
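Read together, the two models form a pipeline: det_face.onnx takes a BGR image and emits per-anchor face scores, boxes, and landmarks; the Gaze-LLE model takes the full frame resized to 448×448 plus the detected head boxes, normalized to [0, 1] as (x1, y1, x2, y2), and returns one 64×64 gaze heatmap per head (heatmap) and one per-head in-frame score (inout). A minimal sketch of the round trip, using the FaceDet and GazeLLE wrapper classes defined below (file paths are assumptions):

using OpenCvSharp;
using System.Collections.Generic;

// Minimal end-to-end sketch; the WinForms demo below wraps exactly this flow.
Mat image = new Mat("test_img\\1.jpg");
FaceDet face_det = new FaceDet("model\\det_face.onnx");
GazeLLE gazelle = new GazeLLE("model\\gazelle_dinov2_vitl14_inout_1x3x448x448_1xNx4.onnx");

List<Bbox> heads = face_det.Detect(image);          // head boxes in image coordinates
List<Mat> heatmaps = gazelle.Predict(image, heads); // one heatmap per head, resized to image size

// The peak of heatmaps[i] is the predicted gaze target for heads[i].
Cv2.MinMaxLoc(heatmaps[0], out double minVal, out double maxVal, out Point minLoc, out Point gazePoint);
Cv2.Circle(image, gazePoint, 5, new Scalar(0, 255, 0), -1);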
Form1.cs
using OpenCvSharp;
using System;
using System.Collections.Generic;
using System.Drawing;
using System.Drawing.Imaging;
using System.Windows.Forms;

namespace Onnx_Demo
{
    public partial class Form1 : Form
    {
        public Form1()
        {
            InitializeComponent();
        }

        string fileFilter = "*.*|*.bmp;*.jpg;*.jpeg;*.tiff;*.png";
        string image_path = "";
        DateTime dt1 = DateTime.Now;
        DateTime dt2 = DateTime.Now;
        Mat image;
        Mat result_image;
        FaceDet face_det;
        GazeLLE gazelle;
        private void button1_Click(object sender, EventArgs e)
        {
            OpenFileDialog ofd = new OpenFileDialog();
            ofd.Filter = fileFilter;
            if (ofd.ShowDialog() != DialogResult.OK) return;
            pictureBox1.Image = null;
            image_path = ofd.FileName;
            pictureBox1.Image = new Bitmap(image_path);
            textBox1.Text = "";
            image = new Mat(image_path);
            pictureBox2.Image = null;
        }
        private void button2_Click(object sender, EventArgs e)
        {
            if (image_path == "")
            {
                return;
            }
            button2.Enabled = false;
            Application.DoEvents();
            result_image = image.Clone();
            dt1 = DateTime.Now;
            // Stage 1: detect heads.
            List<Bbox> head_boxes = face_det.Detect(image);
            foreach (var item in head_boxes)
            {
                Rect rect = Rect.FromLTRB((int)item.xmin, (int)item.ymin, (int)item.xmax, (int)item.ymax);
                Cv2.Rectangle(result_image, rect, Scalar.Red);
            }
            // Stage 2: predict one gaze heatmap per head.
            List<Mat> resized_heatmaps = gazelle.Predict(image, head_boxes);
            dt2 = DateTime.Now;
            DrawGaze(result_image, head_boxes, resized_heatmaps);
            pictureBox2.Image = new Bitmap(result_image.ToMemoryStream());
            textBox1.Text = "Inference time: " + (dt2 - dt1).TotalMilliseconds + "ms";
            button2.Enabled = true;
        }
        void DrawGaze(Mat frame, List<Bbox> head_boxes, List<Mat> heatmaps, float thr = 0.0f)
        {
            int num_box = head_boxes.Count;
            for (int i = 0; i < num_box; i++)
            {
                // The heatmap peak is the predicted gaze target; the heatmaps were
                // already resized to frame size, so the peak is in pixel coordinates.
                double max_score;
                OpenCvSharp.Point classIdPoint;
                double minVal;
                OpenCvSharp.Point minLoc;
                Cv2.MinMaxLoc(heatmaps[i], out minVal, out max_score, out minLoc, out classIdPoint);
                int cx = classIdPoint.X;
                int cy = classIdPoint.Y;
                if (max_score >= thr)
                {
                    // Draw an arrow from the head-box center to the gaze target.
                    int head_cx = (int)((head_boxes[i].xmin + head_boxes[i].xmax) * 0.5);
                    int head_cy = (int)((head_boxes[i].ymin + head_boxes[i].ymax) * 0.5);
                    Cv2.ArrowedLine(frame, new OpenCvSharp.Point(head_cx, head_cy), new OpenCvSharp.Point(cx, cy), new Scalar(0, 255, 0), 2, LineTypes.AntiAlias);
                }
            }
        }
        private void Form1_Load(object sender, EventArgs e)
        {
            // ONNX files: https://pan.baidu.com/s/1aBl4IALa38HvhMCFZcF8Wg (extraction code: kjw1)
            face_det = new FaceDet("model\\det_face.onnx");
            gazelle = new GazeLLE("model\\gazelle_dinov2_vitl14_inout_1x3x448x448_1xNx4.onnx");
            image_path = "test_img\\1.jpg";
        }
        private void button3_Click(object sender, EventArgs e)
        {
            if (pictureBox2.Image == null)
            {
                return;
            }
            Bitmap output = new Bitmap(pictureBox2.Image);
            SaveFileDialog sdf = new SaveFileDialog();
            sdf.Title = "Save";
            sdf.Filter = "Images (*.jpg)|*.jpg|Images (*.png)|*.png|Images (*.bmp)|*.bmp|Images (*.emf)|*.emf|Images (*.exif)|*.exif|Images (*.gif)|*.gif|Images (*.ico)|*.ico|Images (*.tiff)|*.tiff|Images (*.wmf)|*.wmf";
            if (sdf.ShowDialog() == DialogResult.OK)
            {
                switch (sdf.FilterIndex)
                {
                    case 1:
                        output.Save(sdf.FileName, ImageFormat.Jpeg);
                        break;
                    case 2:
                        output.Save(sdf.FileName, ImageFormat.Png);
                        break;
                    case 3:
                        output.Save(sdf.FileName, ImageFormat.Bmp);
                        break;
                    case 4:
                        output.Save(sdf.FileName, ImageFormat.Emf);
                        break;
                    case 5:
                        output.Save(sdf.FileName, ImageFormat.Exif);
                        break;
                    case 6:
                        output.Save(sdf.FileName, ImageFormat.Gif);
                        break;
                    case 7:
                        output.Save(sdf.FileName, ImageFormat.Icon);
                        break;
                    case 8:
                        output.Save(sdf.FileName, ImageFormat.Tiff);
                        break;
                    case 9:
                        output.Save(sdf.FileName, ImageFormat.Wmf);
                        break;
                }
                MessageBox.Show("Saved successfully, location: " + sdf.FileName);
            }
        }
    }
}
FaceDet.cs
using Microsoft.ML.OnnxRuntime;
using Microsoft.ML.OnnxRuntime.Tensors;
using OpenCvSharp;
using System.Collections.Generic;
using System.Linq;

namespace Onnx_Demo
{
    internal class FaceDet
    {
        InferenceSession onnx_session;
        int input_size = 512;
        float det_thresh = 0.5f;
        int fmc = 3;                                      // number of feature-map scales
        int[] feat_stride_fpn = new int[] { 8, 16, 32 };  // FPN strides
        float nms_thresh = 0.4f;
        float[] input_image;
        float det_scale;

        public FaceDet(string model_path)
        {
            SessionOptions options = new SessionOptions();
            options.LogSeverityLevel = OrtLoggingLevel.ORT_LOGGING_LEVEL_INFO;
            options.AppendExecutionProvider_CPU(0);
            onnx_session = new InferenceSession(model_path, options);
        }
        public void Preprocess(Mat srcimg)
        {
            // Resize with preserved aspect ratio, then pad to a square input.
            float im_ratio = (float)srcimg.Rows / (float)srcimg.Cols;
            int new_width = input_size;
            int new_height = (int)(new_width * im_ratio);
            if (im_ratio > 1)
            {
                new_height = input_size;
                new_width = (int)(new_height / im_ratio);
            }
            det_scale = (float)new_height / srcimg.Rows;
            Mat resized_img = new Mat();
            Cv2.Resize(srcimg, resized_img, new Size(new_width, new_height));
            Mat det_img = new Mat();
            Cv2.CopyMakeBorder(resized_img, det_img, 0, input_size - new_height, 0, input_size - new_width, BorderTypes.Constant, 0);
            Mat[] bgrChannels = Cv2.Split(det_img);
            for (int c = 0; c < 3; c++)
            {
                // Normalize to roughly [-1, 1]: (x - 127.5) / 128
                bgrChannels[c].ConvertTo(bgrChannels[c], MatType.CV_32FC1, 1 / 128.0, -127.5 / 128.0);
            }
            Cv2.Merge(bgrChannels, det_img);
            foreach (Mat channel in bgrChannels)
            {
                channel.Dispose();
            }
            input_image = Common.ExtractMat(det_img);
            det_img.Dispose();
            resized_img.Dispose();
        }
        void GenerateProposal(float[] p_box, float[] p_scores, float[] p_kps, int stride, List<Bbox> boxes)
        {
            int feat_h = input_size / stride;
            int feat_w = input_size / stride;
            int num_anchors = 2;
            for (int i = 0; i < feat_h; i++)
            {
                for (int j = 0; j < feat_w; j++)
                {
                    for (int n = 0; n < num_anchors; n++)
                    {
                        int index = i * feat_w * num_anchors + j * num_anchors + n;
                        if (p_scores[index] >= det_thresh)
                        {
                            // SCRFD-style decoding: the network predicts distances
                            // (left, top, right, bottom) from the anchor center (j, i).
                            Bbox box = new Bbox();
                            box.xmin = (j - p_box[index * 4]) * stride;
                            box.ymin = (i - p_box[index * 4 + 1]) * stride;
                            box.xmax = (j + p_box[index * 4 + 2]) * stride;
                            box.ymax = (i + p_box[index * 4 + 3]) * stride;
                            // Map back to original image coordinates.
                            box.xmin /= det_scale;
                            box.ymin /= det_scale;
                            box.xmax /= det_scale;
                            box.ymax /= det_scale;
                            for (int k = 0; k < 5; k++)
                            {
                                float px = (j + p_kps[index * 10 + k * 2]) * stride;
                                float py = (i + p_kps[index * 10 + k * 2 + 1]) * stride;
                                px /= det_scale;
                                py /= det_scale;
                                box.kps[k * 2] = px;
                                box.kps[k * 2 + 1] = py;
                            }
                            box.score = p_scores[index];
                            boxes.Add(box);
                        }
                    }
                }
            }
        }
        public List<Bbox> Detect(Mat srcimg)
        {
            Preprocess(srcimg);
            Tensor<float> input_tensor = new DenseTensor<float>(input_image, new int[] { 1, 3, input_size, input_size });
            List<NamedOnnxValue> input_container = new List<NamedOnnxValue>
            {
                NamedOnnxValue.CreateFromTensor("input.1", input_tensor)
            };
            var ort_outputs = onnx_session.Run(input_container).ToArray();
            List<Bbox> boxes = new List<Bbox>();
            for (int i = 0; i < 3; i++)
            {
                // Outputs are grouped as [scores x3, boxes x3, keypoints x3], one per stride.
                float[] p_scores = ort_outputs[i].AsTensor<float>().ToArray();
                float[] p_bbox = ort_outputs[i + fmc].AsTensor<float>().ToArray();
                float[] p_kps = ort_outputs[i + fmc * 2].AsTensor<float>().ToArray();
                GenerateProposal(p_bbox, p_scores, p_kps, feat_stride_fpn[i], boxes);
            }
            Common.NMSBoxes(boxes, nms_thresh);
            // Expand each face box by 15% per side to better cover the whole head.
            for (int i = 0; i < boxes.Count; i++)
            {
                float padw = (float)((boxes[i].xmax - boxes[i].xmin) * 0.15);
                float padh = (float)((boxes[i].ymax - boxes[i].ymin) * 0.15);
                boxes[i].xmin -= padw;
                boxes[i].ymin -= padh;
                boxes[i].xmax += padw;
                boxes[i].ymax += padh;
            }
            return boxes;
        }
    }
}
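FaceDet (and GazeLLE below) lean on two helpers the article never lists: the Bbox record and the Common utility class with ExtractMat and NMSBoxes. A minimal sketch of plausible implementations, inferred from how they are called above; these are assumptions, not the author's original files:

using OpenCvSharp;
using System.Collections.Generic;
using System.Runtime.InteropServices;

namespace Onnx_Demo
{
    // Detection record: box, score, and 5 facial landmarks as (x, y) pairs.
    internal class Bbox
    {
        public float xmin, ymin, xmax, ymax;
        public float score;
        public float[] kps = new float[10];
    }

    internal static class Common
    {
        // Flatten a CV_32FC3 Mat into a CHW float array (NCHW with N = 1).
        public static float[] ExtractMat(Mat src)
        {
            int h = src.Rows, w = src.Cols;
            float[] result = new float[3 * h * w];
            Mat[] channels = Cv2.Split(src);
            for (int c = 0; c < 3; c++)
            {
                // Channels produced by Cv2.Split are continuous, so a straight copy works.
                Marshal.Copy(channels[c].Data, result, c * h * w, h * w);
                channels[c].Dispose();
            }
            return result;
        }

        // Greedy IoU-based non-maximum suppression, in place.
        public static void NMSBoxes(List<Bbox> boxes, float nms_thresh)
        {
            boxes.Sort((a, b) => b.score.CompareTo(a.score));
            for (int i = 0; i < boxes.Count; i++)
            {
                for (int j = boxes.Count - 1; j > i; j--)
                {
                    if (IoU(boxes[i], boxes[j]) > nms_thresh)
                        boxes.RemoveAt(j);
                }
            }
        }

        static float IoU(Bbox a, Bbox b)
        {
            float x1 = System.Math.Max(a.xmin, b.xmin);
            float y1 = System.Math.Max(a.ymin, b.ymin);
            float x2 = System.Math.Min(a.xmax, b.xmax);
            float y2 = System.Math.Min(a.ymax, b.ymax);
            float inter = System.Math.Max(0f, x2 - x1) * System.Math.Max(0f, y2 - y1);
            float union = (a.xmax - a.xmin) * (a.ymax - a.ymin)
                        + (b.xmax - b.xmin) * (b.ymax - b.ymin) - inter;
            return union <= 0f ? 0f : inter / union;
        }
    }
}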
GazeLLE.cs
using Microsoft.ML.OnnxRuntime;
using Microsoft.ML.OnnxRuntime.Tensors;
using OpenCvSharp;
using System;
using System.Collections.Generic;
using System.Linq;

namespace Onnx_Demo
{
    internal class GazeLLE
    {
        InferenceSession onnx_session;
        int input_size = 448;
        float[] input_image;
        float[] head_boxes_xyxy_norm;

        public GazeLLE(string model_path)
        {
            SessionOptions options = new SessionOptions();
            options.LogSeverityLevel = OrtLoggingLevel.ORT_LOGGING_LEVEL_INFO;
            options.AppendExecutionProvider_CPU(0);
            onnx_session = new InferenceSession(model_path, options);
        }
        void Preprocess(Mat img)
        {
            // Resize to 448x448 and convert to float32; no normalization is applied here.
            Mat resized_img = new Mat();
            Cv2.Resize(img, resized_img, new Size(input_size, input_size));
            Mat[] bgrChannels = Cv2.Split(resized_img);
            for (int c = 0; c < 3; c++)
            {
                bgrChannels[c].ConvertTo(bgrChannels[c], MatType.CV_32FC1);
            }
            Cv2.Merge(bgrChannels, resized_img);
            foreach (Mat channel in bgrChannels)
            {
                channel.Dispose();
            }
            input_image = Common.ExtractMat(resized_img);
            resized_img.Dispose();
        }
        public List<Mat> Predict(Mat srcimg, List<Bbox> head_boxes)
        {
            float img_h = (float)srcimg.Rows;
            float img_w = (float)srcimg.Cols;
            Preprocess(srcimg);
            int num_box = head_boxes.Count;
            int[] input_head_boxes_shape = new int[] { 1, num_box, 4 }; // batch size is fixed at 1
            head_boxes_xyxy_norm = new float[1 * num_box * 4];
            // Normalize head boxes to [0, 1] relative to the original image size.
            for (int i = 0; i < num_box; i++)
            {
                head_boxes_xyxy_norm[i * 4] = head_boxes[i].xmin / img_w;
                head_boxes_xyxy_norm[i * 4 + 1] = head_boxes[i].ymin / img_h;
                head_boxes_xyxy_norm[i * 4 + 2] = head_boxes[i].xmax / img_w;
                head_boxes_xyxy_norm[i * 4 + 3] = head_boxes[i].ymax / img_h;
            }
            Tensor<float> input_tensor_image_bgr = new DenseTensor<float>(input_image, new int[] { 1, 3, input_size, input_size });
            Tensor<float> input_tensor_bboxes_x1y1x2y2 = new DenseTensor<float>(head_boxes_xyxy_norm, input_head_boxes_shape);
            List<NamedOnnxValue> input_container = new List<NamedOnnxValue>
            {
                NamedOnnxValue.CreateFromTensor("image_bgr", input_tensor_image_bgr),
                NamedOnnxValue.CreateFromTensor("bboxes_x1y1x2y2", input_tensor_bboxes_x1y1x2y2)
            };
            var ort_outputs = onnx_session.Run(input_container).ToArray();
            float[] pdata = ort_outputs[0].AsTensor<float>().ToArray();
            int[] out_shape = ort_outputs[0].AsTensor<float>().Dimensions.ToArray();
            int num_map = out_shape[0];
            List<Mat> resized_heatmaps = new List<Mat>();
            for (int i = 0; i < num_map; i++)
            {
                resized_heatmaps.Add(new Mat());
            }
            int image_area = out_shape[1] * out_shape[2];
            for (int i = 0; i < num_map; i++)
            {
                // Wrap each 64x64 heatmap and resize it to the original image size.
                float[] data = new float[image_area];
                Array.Copy(pdata, i * image_area, data, 0, image_area);
                Mat heatmap = new Mat(out_shape[1], out_shape[2], MatType.CV_32FC1, data);
                Cv2.Resize(heatmap, resized_heatmaps[i], new Size(srcimg.Cols, srcimg.Rows));
                //Cv2.ImShow(i.ToString(), resized_heatmaps[i]);
            }
            //float[] pdata1 = ort_outputs[1].AsTensor<float>().ToArray();
            // Postprocessing: the disable_attention_heatmap_mode blending is skipped;
            // drawing the gaze arrows is what matters for this demo.
            return resized_heatmaps;
        }
    }
}
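The second model output, inout, is left unused above (the commented-out pdata1 line). It holds one score per head for whether the gaze target lies inside the frame, so a natural extension is to filter the arrows with it. A hedged sketch; the helper name, the 0.5 threshold, and treating the score as directly thresholdable are assumptions:

using Microsoft.ML.OnnxRuntime;
using Microsoft.ML.OnnxRuntime.Tensors;
using System.Linq;

namespace Onnx_Demo
{
    internal static class InoutFilter
    {
        // Returns true per head when the model predicts the gaze target is in frame.
        // Call with the same ort_outputs array produced inside GazeLLE.Predict.
        public static bool[] InFrame(DisposableNamedOnnxValue[] ort_outputs, float thr = 0.5f)
        {
            float[] scores = ort_outputs[1].AsTensor<float>().ToArray();
            bool[] keep = new bool[scores.Length];
            for (int i = 0; i < scores.Length; i++)
            {
                keep[i] = scores[i] >= thr;
            }
            return keep;
        }
    }
}

DrawGaze's thr parameter thresholds the heatmap peak instead; the inout output is the model's dedicated in-frame signal, so filtering on it avoids drawing arrows for people looking out of frame.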
Source: opendotnet